diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py index 36c95852452ac..7820fbda803d7 100644 --- a/.ci/generate_test_report_lib.py +++ b/.ci/generate_test_report_lib.py @@ -41,10 +41,12 @@ def _parse_ninja_log(ninja_log: list[str]) -> list[tuple[str, str]]: # touch test/4.stamp # # index will point to the line that starts with Failed:. The progress - # indicator is the line before this ([4/5] test/4.stamp) and contains a pretty - # printed version of the target being built (test/4.stamp). We use this line - # and remove the progress information to get a succinct name for the target. - failing_action = ninja_log[index - 1].split("] ")[1] + # indicator is sometimes the line before this ([4/5] test/4.stamp) and + # will contain a pretty printed version of the target being built + # (test/4.stamp) when accurate. We instead parse the failed line rather + # than the progress indicator as the progress indicator may not be + # aligned with the failure. + failing_action = ninja_log[index].split("FAILED: ")[1] failure_log = [] while ( index < len(ninja_log) diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py index 431e10da6405a..4068a3b7300a4 100644 --- a/.ci/generate_test_report_lib_test.py +++ b/.ci/generate_test_report_lib_test.py @@ -39,7 +39,7 @@ def test_find_failure_ninja_logs(self): self.assertEqual( failures[0], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -77,7 +77,7 @@ def test_ninja_log_end(self): self.assertEqual( failures[0], ( - "test/3.stamp", + "touch test/3.stamp", dedent( """\ FAILED: touch test/3.stamp @@ -106,7 +106,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -117,7 +117,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[1], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -150,7 +150,7 @@ def test_ninja_log_runtimes_failure(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -159,6 +159,34 @@ def test_ninja_log_runtimes_failure(self): ), ) + # Test that we correctly handle cases where the FAILED: line does not + # match up with the progress indicator. + def test_ninja_log_mismatched_failed(self): + failures = generate_test_report_lib.find_failure_in_ninja_logs( + [ + [ + "[1/5] test/1.stamp", + "[2/5] test/2.stamp", + "ModuleNotFoundError: No module named 'mount_langley'", + "FAILED: tools/check-langley", + "Wow! This system is really broken!", + "[5/5] test/5.stamp", + ] + ] + ) + self.assertEqual(len(failures), 1) + self.assertEqual( + failures[0], + ( + "tools/check-langley", + dedent( + """\ + FAILED: tools/check-langley + Wow! This system is really broken!""" + ), + ), + ) + def test_title_only(self): self.assertEqual( generate_test_report_lib.generate_report("Foo", 0, [], []), @@ -448,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): All tests passed but another part of the build **failed**. Click on a failure below to see the details.
- test/2.stamp + touch test/2.stamp ``` FAILED: touch test/2.stamp @@ -456,7 +484,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): ```
- test/4.stamp + touch test/4.stamp ``` FAILED: touch test/4.stamp diff --git a/.github/workflows/build-ci-container-tooling.yml b/.github/workflows/build-ci-container-tooling.yml index c77c78617666d..992947eb2fffb 100644 --- a/.github/workflows/build-ci-container-tooling.yml +++ b/.github/workflows/build-ci-container-tooling.yml @@ -63,7 +63,7 @@ jobs: podman save ${{ steps.vars.outputs.container-name-lint-tag }} > ${{ steps.vars.outputs.container-lint-filename }} - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container-amd64 path: "*.tar" diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index f18a69c192ee9..6ecad5536109b 100644 --- a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -28,7 +28,7 @@ jobs: - name: Setup Python uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: 3.13 + python-version: 3.14 cache: 'pip' - name: Install Python Dependencies run: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b5f3413fe3b6b..7374777cb759c 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -97,7 +97,7 @@ jobs: - name: Setup Python env uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.13' + python-version: '3.14' cache: 'pip' cache-dependency-path: 'llvm/docs/requirements-hashed.txt' - name: Install python dependencies diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml index 63388ebc706bd..6d490ca2c4b29 100644 --- a/.github/workflows/gha-codeql.yml +++ b/.github/workflows/gha-codeql.yml @@ -29,9 +29,9 @@ jobs: sparse-checkout: | .github/ - name: Initialize CodeQL - uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/init@5d5cd550d3e189c569da8f16ea8de2d821c9bf7a # v3.31.2 with: languages: actions queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/analyze@5d5cd550d3e189c569da8f16ea8de2d821c9bf7a # v3.31.2 diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index dcb852312d41a..ce6ccfa23df6a 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -54,7 +54,7 @@ jobs: path: golden-images - name: Setup Windows if: runner.os == 'Windows' - uses: llvm/actions/setup-windows@main + uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: arch: amd64 - name: Build DXC @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action/macos@34d7c956a59aed1bfebf31df77b8de55db9bbaaf # v2.21.0 if: always() && runner.os == 'macOS' with: comment_mode: off diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index 5ccf976848197..432c45744abda 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -100,7 +100,7 @@ jobs: repo: ${{ github.repository }} steps: - name: Install Ninja - uses: llvm/actions/install-ninja@main + uses: 
llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Install abi-compliance-checker run: | sudo apt-get update diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index 312cb47fc3d93..4bce86145fc0c 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -55,7 +55,7 @@ jobs: TAG: ${{ github.sha }} - name: Log in to GitHub Container Registry - uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 9e8f55859fc7a..e2ca940d2f0b3 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Extract information from the PR id: vars diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml index f73d180bb0005..961f1cc79389d 100644 --- a/.github/workflows/llvm-abi-tests.yml +++ b/.github/workflows/llvm-abi-tests.yml @@ -88,7 +88,7 @@ jobs: repo: ${{ github.repository }} steps: - name: Install Ninja - uses: llvm/actions/install-ninja@main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Install abi-compliance-checker run: | sudo apt-get update diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index 7d42abfadde7b..3274f1adf9e61 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -39,6 +39,12 @@ jobs: repo: context.repo.repo }) .then((issue) => { + var maybeTruncatedBody = issue.data.body; + if (maybeTruncatedBody.length > 15000) { + maybeTruncatedBody = maybeTruncatedBody.substring(0, + 15000) + + "Please see the issue for the entire body." 
+ } const payload = { author : issue.data.user.login, issue : issue.data.number, @@ -46,7 +52,7 @@ jobs: url : issue.data.html_url, labels : issue.data.labels.map((label) => label.name), assignee : issue.data.assignees.map((assignee) => assignee.login), - body : issue.data.body + body : maybeTruncatedBody }; const data = { diff --git a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml index 8480a657cc717..a5dcad28dbe24 100644 --- a/.github/workflows/new-issues.yml +++ b/.github/workflows/new-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: llvm/actions/issue-labeler@main + - uses: llvm/actions/issue-labeler@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} configuration-path: .github/new-issues-labeler.yml diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 6303a119750b5..973d3abf358ce 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -190,7 +190,7 @@ jobs: with: max-size: "2000M" - name: Install Ninja - uses: llvm/actions/install-ninja@main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Build and Test run: | source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py) diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index fa73b9d9fe8d0..25f426b7814df 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -68,7 +68,7 @@ jobs: # due to https://github.com/actions/runner-images/issues/10385 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -138,7 +138,6 @@ jobs: target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF" fi - echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT case "${{ inputs.runs-on }}" in ubuntu-22.04*) build_runs_on="depot-${{ inputs.runs-on }}-16" @@ -157,6 +156,23 @@ jobs: build_runs_on=$test_runs_on ;; esac + + case "$build_runs_on" in + # These runners cannot build the full release package faster than + # the 6 hours timeout limit, so we need to use a configuration + # that builds more quickly. + macos-14) + bootstrap_prefix="BOOTSTRAP" + target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF -DLLVM_RELEASE_ENABLE_PGO=OFF" + ;; + *) + bootstrap_prefix="BOOTSTRAP_BOOTSTRAP" + ;; + esac + + target_cmake_flags="$target_cmake_flags -D${bootstrap_prefix}_CPACK_PACKAGE_FILE_NAME=$release_binary_basename" + + echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT echo "build-runs-on=$build_runs_on" >> $GITHUB_OUTPUT echo "test-runs-on=$test_runs_on" >> $GITHUB_OUTPUT @@ -173,11 +189,11 @@ jobs: ref: ${{ needs.prepare.outputs.ref }} - name: Install Ninja - uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Setup Windows if: startsWith(runner.os, 'Windows') - uses: llvm/actions/setup-windows@main + uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: arch: amd64 @@ -200,8 +216,7 @@ jobs: # so we need to set some extra cmake flags to disable this. 
cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \ ${{ needs.prepare.outputs.target-cmake-flags }} \ - -C clang/cmake/caches/Release.cmake \ - -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" + -C clang/cmake/caches/Release.cmake - name: Build shell: bash diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index c07df338cf989..bd3277a8b452c 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 with: results_file: results.sarif results_format: sarif diff --git a/bolt/README.md b/bolt/README.md index 902d1eb6e7694..55f742c5019f5 100644 --- a/bolt/README.md +++ b/bolt/README.md @@ -173,7 +173,7 @@ Once you have `perf.fdata` ready, you can use it for optimizations with BOLT. Assuming your environment is setup to include the right path, execute `llvm-bolt`: ``` -$ llvm-bolt -o .bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=hfsort -split-functions -split-all-cold -split-eh -dyno-stats +$ llvm-bolt -o .bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions -split-all-cold -split-eh -dyno-stats ``` If you do need an updated debug info, then add `-update-debug-sections` option diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 43ceceee7de45..7c6e01d669b74 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -381,11 +381,6 @@ Set verbosity level for diagnostic output -- `--write-dwp` - - Output a single dwarf package file (dwp) instead of multiple non-relocatable - dwarf object files (dwo). - ### BOLT optimization options: - `--align-blocks` diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index d666c10885ad5..5e349cd69fb43 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -840,6 +840,16 @@ class MCPlusBuilder { return false; } + virtual bool isLDRWl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isLDRXl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + virtual bool isMOVW(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; @@ -1789,6 +1799,19 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + /// Take \p LDRInst and return ADRP+LDR instruction sequence - for + /// + /// ldr x0, [label] + /// + /// the following sequence will be generated: + /// + /// adrp x0, PageBase(label) + /// ldr x0, [x0, PageOffset(label)] + virtual InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const { + llvm_unreachable("not implemented"); + } + /// Return not 0 if the instruction CurInst, in combination with the recent /// history of disassembled instructions supplied by [Begin, End), is a linker /// generated veneer/stub that needs patching. 
This happens in AArch64 when diff --git a/bolt/include/bolt/Passes/ADRRelaxationPass.h b/bolt/include/bolt/Passes/AArch64RelaxationPass.h similarity index 51% rename from bolt/include/bolt/Passes/ADRRelaxationPass.h rename to bolt/include/bolt/Passes/AArch64RelaxationPass.h index b9f92dec7f03b..b9185a1e34388 100644 --- a/bolt/include/bolt/Passes/ADRRelaxationPass.h +++ b/bolt/include/bolt/Passes/AArch64RelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/AArch64RelaxationPass.h ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,29 +6,29 @@ // //===----------------------------------------------------------------------===// // -// This file declares the ADRRelaxationPass class, which replaces AArch64 -// non-local ADR instructions with ADRP + ADD due to small offset range of ADR -// instruction (+- 1MB) which could be easily overflowed after BOLT -// optimizations. Such problems are usually connected with errata 843419 -// https://developer.arm.com/documentation/epm048406/2100/ +// This file declares the AArch64RelaxationPass class, which replaces AArch64 +// non-local ADR/LDR instructions with ADRP + ADD/LDR due to small offset +// range of ADR and LDR instruction (+- 1MB) which could be easily overflowed +// after BOLT optimizations. Such problems are usually connected with errata +// 843419: https://developer.arm.com/documentation/epm048406/2100/ // The linker could replace ADRP instruction with ADR in some cases. // //===----------------------------------------------------------------------===// -#ifndef BOLT_PASSES_ADRRELAXATIONPASS_H -#define BOLT_PASSES_ADRRELAXATIONPASS_H +#ifndef BOLT_PASSES_AARCH64RELAXATIONPASS_H +#define BOLT_PASSES_AARCH64RELAXATIONPASS_H #include "bolt/Passes/BinaryPasses.h" namespace llvm { namespace bolt { -class ADRRelaxationPass : public BinaryFunctionPass { +class AArch64RelaxationPass : public BinaryFunctionPass { public: - explicit ADRRelaxationPass(const cl::opt &PrintPass) + explicit AArch64RelaxationPass(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) {} - const char *getName() const override { return "adr-relaxation"; } + const char *getName() const override { return "aarch64-relaxation"; } /// Pass entry point Error runOnFunctions(BinaryContext &BC) override; diff --git a/bolt/include/bolt/Passes/FixRelaxationPass.h b/bolt/include/bolt/Passes/FixRelaxationPass.h index 50b64480aa62e..cf5a8a1fcb134 100644 --- a/bolt/include/bolt/Passes/FixRelaxationPass.h +++ b/bolt/include/bolt/Passes/FixRelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/FixRelaxationPass.h --------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index a383ced1712e3..7af32c8c56635 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -78,6 +78,11 @@ cl::opt CompDirOverride( "to *.dwo files."), cl::Hidden, cl::init(""), cl::cat(BoltCategory)); +static cl::opt CloneConstantIsland("clone-constant-island", + cl::desc("clone constant islands"), + cl::Hidden, cl::init(true), + cl::ZeroOrMore, cl::cat(BoltCategory)); + static cl::opt FailOnInvalidPadding("fail-on-invalid-padding", cl::Hidden, cl::init(false), cl::desc("treat invalid code padding as error"), @@ -461,7 +466,8 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, // of dynamic relocs, as we currently do not support cloning them. // Notice: we might fail to link because of this, if the original constant // island we are referring would be emitted too far away. - if (IslandIter->second->hasDynamicRelocationAtIsland()) { + if (IslandIter->second->hasDynamicRelocationAtIsland() || + !opts::CloneConstantIsland) { MCSymbol *IslandSym = IslandIter->second->getOrCreateIslandAccess(Address); if (IslandSym) @@ -469,6 +475,12 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, } else if (MCSymbol *IslandSym = IslandIter->second->getOrCreateProxyIslandAccess(Address, BF)) { + LLVM_DEBUG( + dbgs() << "BOLT-DEBUG: clone constant island at address 0x" + << Twine::utohexstr(IslandIter->first) << " with size of 0x" + << Twine::utohexstr( + IslandIter->second->estimateConstantIslandSize()) + << " bytes, referenced by " << BF << "\n"); BF.createIslandDependency(IslandSym, IslandIter->second); return std::make_pair(IslandSym, 0); } @@ -778,13 +790,17 @@ void BinaryContext::populateJumpTables() { } if (opts::StrictMode && DataPCRelocations.size()) { - LLVM_DEBUG({ - dbgs() << DataPCRelocations.size() - << " unclaimed PC-relative relocations left in data:\n"; - for (uint64_t Reloc : DataPCRelocations) - dbgs() << Twine::utohexstr(Reloc) << '\n'; - }); - assert(0 && "unclaimed PC-relative relocations left in data\n"); + this->errs() << "BOLT-ERROR: " << DataPCRelocations.size() + << " unclaimed PC-relative relocation(s) left in data"; + if (opts::Verbosity) { + this->errs() << ":\n"; + for (uint64_t RelocOffset : DataPCRelocations) + this->errs() << " @0x" << Twine::utohexstr(RelocOffset) << '\n'; + } else { + this->errs() << ". Re-run with -v=1 to see the list\n"; + } + this->errs() << "BOLT-ERROR: unable to proceed with --strict\n"; + exit(1); } clearList(DataPCRelocations); } diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/AArch64RelaxationPass.cpp similarity index 67% rename from bolt/lib/Passes/ADRRelaxationPass.cpp rename to bolt/lib/Passes/AArch64RelaxationPass.cpp index c3954c94a7f92..610adad58cfcb 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/AArch64RelaxationPass.cpp @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.cpp ----------------------------------===// +//===- bolt/Passes/AArch64RelaxationPass.cpp ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This file implements the ADRRelaxationPass class. +// This file implements the AArch64RelaxationPass class. 
// //===----------------------------------------------------------------------===// -#include "bolt/Passes/ADRRelaxationPass.h" +#include "bolt/Passes/AArch64RelaxationPass.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/Utils/CommandLineOpts.h" #include @@ -20,10 +20,10 @@ using namespace llvm; namespace opts { extern cl::OptionCategory BoltCategory; -static cl::opt - AdrPassOpt("adr-relaxation", - cl::desc("Replace ARM non-local ADR instructions with ADRP"), - cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden); +static cl::opt AArch64PassOpt( + "aarch64-relaxation", + cl::desc("Replace ARM non-local ADR/LDR instructions with ADRP"), + cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden); } // namespace opts namespace llvm { @@ -35,7 +35,7 @@ namespace bolt { // jobs and checking the exit flag after it. static bool PassFailed = false; -void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { +void AArch64RelaxationPass::runOnFunction(BinaryFunction &BF) { if (PassFailed) return; @@ -43,10 +43,13 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { for (BinaryBasicBlock &BB : BF) { for (auto It = BB.begin(); It != BB.end(); ++It) { MCInst &Inst = *It; - if (!BC.MIB->isADR(Inst)) + bool IsADR = BC.MIB->isADR(Inst); + + // TODO: Handle other types of LDR (literal, PC-relative) instructions. + if (!IsADR && !BC.MIB->isLDRXl(Inst) && !BC.MIB->isLDRWl(Inst)) continue; - const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst); + const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, IsADR ? 0 : 1); if (!Symbol) continue; @@ -56,25 +59,27 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { continue; } - // Don't relax ADR if it points to the same function and is in the main - // fragment and BF initial size is < 1MB. + // Don't relax ADR/LDR if it points to the same function and is in the + // main fragment and BF initial size is < 1MB. const unsigned OneMB = 0x100000; if (BF.getSize() < OneMB) { BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); if (TargetBF == &BF && !BB.isSplit()) continue; - // No relaxation needed if ADR references a basic block in the same + // No relaxation needed if ADR/LDR references a basic block in the same // fragment. if (BinaryBasicBlock *TargetBB = BF.getBasicBlockForLabel(Symbol)) if (BB.getFragmentNum() == TargetBB->getFragmentNum()) continue; } - InstructionListType AdrpAdd; + InstructionListType AdrpMaterialization; { auto L = BC.scopeLock(); - AdrpAdd = BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get()); + AdrpMaterialization = + IsADR ? BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get()) + : BC.MIB->createAdrpLdr(Inst, BC.Ctx.get()); } if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) { @@ -88,18 +93,18 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { // invalidate this offset, so we have to rely on linker-inserted NOP to // replace it with ADRP, and abort if it is not present. auto L = BC.scopeLock(); - BC.errs() << "BOLT-ERROR: cannot relax ADR in non-simple function " - << BF << '\n'; + BC.errs() << "BOLT-ERROR: cannot relax " << (IsADR ? 
"ADR" : "LDR") + << " in non-simple function " << BF << '\n'; PassFailed = true; return; } - It = BB.replaceInstruction(It, AdrpAdd); + It = BB.replaceInstruction(It, AdrpMaterialization); } } } -Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { - if (!opts::AdrPassOpt || !BC.HasRelocations) +Error AArch64RelaxationPass::runOnFunctions(BinaryContext &BC) { + if (!opts::AArch64PassOpt || !BC.HasRelocations) return Error::success(); ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { @@ -108,7 +113,7 @@ Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { ParallelUtilities::runOnEachFunction( BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr, - "ADRRelaxationPass"); + "AArch64RelaxationPass"); if (PassFailed) return createFatalBOLTError(""); diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index d7519518f186f..3197e62faad21 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -1,5 +1,5 @@ add_llvm_library(LLVMBOLTPasses - ADRRelaxationPass.cpp + AArch64RelaxationPass.cpp Aligner.cpp AllocCombiner.cpp AsmDump.cpp diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 4e062038a3e4c..8554683bc3cf8 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -564,13 +564,18 @@ void DataAggregator::imputeFallThroughs() { // Skip fall-throughs in external code. if (Trace.From == Trace::EXTERNAL) continue; - std::pair CurrentBranch(Trace.Branch, Trace.From); + if (std::pair CurrentBranch(Trace.Branch, Trace.From); + CurrentBranch != PrevBranch) { + // New group: reset aggregates. + AggregateCount = AggregateFallthroughSize = 0; + PrevBranch = CurrentBranch; + } // BR_ONLY must be the last trace in the group if (Trace.To == Trace::BR_ONLY) { // If the group is not empty, use aggregate values, otherwise 0-length // for unconditional jumps (call/ret/uncond branch) or 1-length for others uint64_t InferredBytes = - PrevBranch == CurrentBranch + AggregateFallthroughSize ? AggregateFallthroughSize / AggregateCount : !checkUnconditionalControlTransfer(Trace.From); Trace.To = Trace.From + InferredBytes; @@ -578,16 +583,11 @@ void DataAggregator::imputeFallThroughs() { << " bytes)\n"); ++InferredTraces; } else { - // Trace with a valid fall-through - // New group: reset aggregates. 
- if (CurrentBranch != PrevBranch) - AggregateCount = AggregateFallthroughSize = 0; // Only use valid fall-through lengths if (Trace.To != Trace::EXTERNAL) AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount; AggregateCount += Info.TakenCount; } - PrevBranch = CurrentBranch; } if (opts::Verbosity >= 1) outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n"; diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 782137e807662..1a0f6d75d63e8 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Rewrite/BinaryPassManager.h" -#include "bolt/Passes/ADRRelaxationPass.h" +#include "bolt/Passes/AArch64RelaxationPass.h" #include "bolt/Passes/Aligner.h" #include "bolt/Passes/AllocCombiner.h" #include "bolt/Passes/AsmDump.h" @@ -129,10 +129,10 @@ static cl::opt PrintJTFootprintReduction( cl::desc("print function after jt-footprint-reduction pass"), cl::Hidden, cl::cat(BoltOptCategory)); -static cl::opt - PrintAdrRelaxation("print-adr-relaxation", - cl::desc("print functions after ADR Relaxation pass"), - cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt PrintAArch64Relaxation( + "print-adr-ldr-relaxation", + cl::desc("print functions after ADR/LDR Relaxation pass"), cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt PrintLongJmp("print-longjmp", @@ -517,7 +517,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { if (BC.isAArch64()) { Manager.registerPass( - std::make_unique(PrintAdrRelaxation)); + std::make_unique(PrintAArch64Relaxation)); // Tighten branches according to offset differences between branch and // targets. No extra instructions after this pass, otherwise we may have diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 7769162d67eaf..57db6a436c5c6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -142,6 +142,7 @@ static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) { atomicAdd(Insts.back(), RegTo, RegTmp); return Insts; } + class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; @@ -583,6 +584,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Inst.getOpcode() == AArch64::ADDXri; } + bool isLDRWl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRWl; + } + + bool isLDRXl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRXl; + } + MCPhysReg getADRReg(const MCInst &Inst) const { assert((isADR(Inst) || isADRP(Inst)) && "Not an ADR instruction"); assert(MCPlus::getNumPrimeOperands(Inst) != 0 && @@ -602,6 +611,39 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return materializeAddress(Target, Ctx, Reg, Addend); } + InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const override { + assert((isLDRXl(LDRInst) || isLDRWl(LDRInst)) && + "LDR (literal, 32 or 64-bit integer load) instruction expected"); + assert(LDRInst.getOperand(0).isReg() && + "unexpected operand in LDR instruction"); + const MCPhysReg DataReg = LDRInst.getOperand(0).getReg(); + const MCPhysReg AddrReg = + isLDRXl(LDRInst) ? 
DataReg + : (MCPhysReg)RegInfo->getMatchingSuperReg( + DataReg, AArch64::sub_32, + &RegInfo->getRegClass(AArch64::GPR64RegClassID)); + const MCSymbol *Target = getTargetSymbol(LDRInst, 1); + assert(Target && "missing target symbol in LDR instruction"); + + InstructionListType Insts(2); + Insts[0].setOpcode(AArch64::ADRP); + Insts[0].clear(); + Insts[0].addOperand(MCOperand::createReg(AddrReg)); + Insts[0].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, 0, Ctx, + ELF::R_AARCH64_NONE); + Insts[1].setOpcode(isLDRXl(LDRInst) ? AArch64::LDRXui : AArch64::LDRWui); + Insts[1].clear(); + Insts[1].addOperand(MCOperand::createReg(DataReg)); + Insts[1].addOperand(MCOperand::createReg(AddrReg)); + Insts[1].addOperand(MCOperand::createImm(0)); + Insts[1].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx, + ELF::R_AARCH64_ADD_ABS_LO12_NC); + return Insts; + } + bool isTB(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::TBNZW || Inst.getOpcode() == AArch64::TBNZX || @@ -2762,7 +2804,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { BitVector WrittenRegs(RegInfo->getNumRegs()); const BitVector &SizeRegAliases = getAliases(SizeReg); - for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { + for (auto InstIt = CallInst; InstIt != BB.begin(); --InstIt) { const MCInst &Inst = *InstIt; WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); diff --git a/bolt/test/AArch64/ldr-relaxation.s b/bolt/test/AArch64/ldr-relaxation.s new file mode 100644 index 0000000000000..7632504a01635 --- /dev/null +++ b/bolt/test/AArch64/ldr-relaxation.s @@ -0,0 +1,122 @@ +## Check that LDR relaxation will fail since LDR is inside a non-simple +## function and there is no NOP next to it. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym FAIL=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: not llvm-bolt %t.so -o %t.bolt 2>&1 | FileCheck %s --check-prefix=FAIL + +# FAIL: BOLT-ERROR: cannot relax LDR in non-simple function _start + +.ifdef FAIL + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is not needed since the reference is not far away. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym NOT_NEEDED=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=NOT_NEEDED + +# NOT_NEEDED: <_start> +# NOT_NEEDED-NEXT: ldr + +.ifdef NOT_NEEDED + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _start + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a simple function, where NOP will +## be inserted as needed. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +# RELAX: adrp +# RELAX-NEXT: ldr + +.ifdef RELAX_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a non-simple function, where NOP +## exists next to LDR. 
+ +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_NON_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +.ifdef RELAX_NON_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + nop + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check LDR relaxation works on loading W (low 32-bit of X) registers. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE_WREG=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAXW + +# RELAXW: adrp x0 +# RELAXW-NEXT: ldr w0 + +.ifdef RELAX_SIMPLE_WREG + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr w0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + + .section .text_cold + .global _foo + .align 3 +_foo: + .long 0x12345678 +.size _foo, .-_foo diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 8c05491e7bca0..ef0bb55df1faf 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -15,6 +15,8 @@ # External return to a landing pad/entry point call continuation # RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT +## Fall-through imputing test cases +# RUN: link_fdata %s %t %t.pa-imp PREAGG-IMP # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh @@ -63,6 +65,11 @@ # RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 +## Check --impute-trace-fall-throughs accepting duplicate branch-only traces +# RUN: perf2bolt %t --pa -p %t.pa-imp -o %t.pa-imp.fdata --impute-trace-fall-through +# RUN: FileCheck %s --check-prefix=CHECK-IMP --input-file %t.pa-imp.fdata +# CHECK-IMP: 0 [unknown] 0 1 main {{.*}} 0 3 + .globl foo .type foo, %function foo: @@ -102,6 +109,8 @@ Ltmp1: Ltmp4: cmpl $0x0, -0x14(%rbp) +# PREAGG-IMP: B X:0 #Ltmp4_br# 1 0 +# PREAGG-IMP: B X:0 #Ltmp4_br# 2 0 Ltmp4_br: je Ltmp0 diff --git a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test index 673e86bb1533a..a08e352d605fe 100644 --- a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test +++ b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: true +# REQUIRES: system-linux ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t @@ -8,7 +8,8 @@ ; RUN: llvm-dwp -e main.exe -o main.exe.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.dwp | FileCheck -check-prefix=PRE-BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.dwp | FileCheck -check-prefix=PRE-BOLT-DWP-TU-INDEX %s -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwp -e main.exe.bolt -o main.exe.bolt.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s diff --git a/bolt/test/X86/unclaimed-pc-rel.s b/bolt/test/X86/unclaimed-pc-rel.s new file mode 100644 index 0000000000000..5292cccba754d --- 
/dev/null +++ b/bolt/test/X86/unclaimed-pc-rel.s @@ -0,0 +1,24 @@ +## Check that unclaimed PC-relative relocation from data to code is detected +## and reported to the user. + +# REQUIRES: system-linux + +# RUN: %clang %cflags -no-pie %s -o %t.exe -Wl,-q -nostartfiles +# RUN: not llvm-bolt %t.exe -o %t.bolt --strict 2>&1 | FileCheck %s + +# CHECK: BOLT-ERROR: 1 unclaimed PC-relative relocation(s) left in data + + .text + .globl _start + .type _start, %function +_start: + movl $42, %eax +.L0: + ret + .size _start, .-_start + +## Force relocation mode. + .reloc 0, R_X86_64_NONE + + .section .rodata + .long .L0-. diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index dc59a08b889a7..badff299603a0 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,7 +7,7 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls) +# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 17 total calls) # CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -84,6 +84,9 @@ # CHECK-ASM-LABEL: : # CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: # CHECK-ASM: bl{{.*}}`_ +for `Extra Clang Tools `_ project. +.. contents:: + :depth: 2 + :local: Active Maintainers ================== diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index 2e21a4c4fd1f9..611717a64b768 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -73,6 +73,7 @@ #include "SizeofExpressionCheck.h" #include "SpuriouslyWakeUpFunctionsCheck.h" #include "StandaloneEmptyCheck.h" +#include "StdNamespaceModificationCheck.h" #include "StringConstructorCheck.h" #include "StringIntegerAssignmentCheck.h" #include "StringLiteralWithEmbeddedNulCheck.h" @@ -237,6 +238,8 @@ class BugproneModule : public ClangTidyModule { "bugprone-spuriously-wake-up-functions"); CheckFactories.registerCheck( "bugprone-standalone-empty"); + CheckFactories.registerCheck( + "bugprone-std-namespace-modification"); CheckFactories.registerCheck( "bugprone-string-constructor"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index 31a0e6906866a..6c96996458040 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -75,6 +75,7 @@ add_clang_library(clangTidyBugproneModule STATIC SmartPtrArrayMismatchCheck.cpp SpuriouslyWakeUpFunctionsCheck.cpp StandaloneEmptyCheck.cpp + StdNamespaceModificationCheck.cpp StringConstructorCheck.cpp StringIntegerAssignmentCheck.cpp StringLiteralWithEmbeddedNulCheck.cpp diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp similarity index 95% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp index 79fbc66b5f8a3..13e5c03d7c4d3 100644 --- 
a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "DontModifyStdNamespaceCheck.h" +#include "StdNamespaceModificationCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchersInternal.h" @@ -36,9 +36,9 @@ AST_POLYMORPHIC_MATCHER_P( } // namespace -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { +void StdNamespaceModificationCheck::registerMatchers(MatchFinder *Finder) { auto HasStdParent = hasDeclContext(namespaceDecl(hasAnyName("std", "posix"), unless(hasParent(namespaceDecl()))) @@ -96,7 +96,7 @@ void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { .bind("decl"), this); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { const NamespaceDecl *LastNS = nullptr; @@ -108,7 +108,7 @@ static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { return LastNS; } -void clang::tidy::cert::DontModifyStdNamespaceCheck::check( +void clang::tidy::bugprone::StdNamespaceModificationCheck::check( const MatchFinder::MatchResult &Result) { const auto *D = Result.Nodes.getNodeAs("decl"); const auto *NS = Result.Nodes.getNodeAs("nmspc"); diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h similarity index 61% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h index cfcd878644ddb..0f62dc3d9ab70 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h @@ -6,21 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Modification of the std or posix namespace can result in undefined behavior. /// This check warns for such modifications. 
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/dcl58-cpp.html -class DontModifyStdNamespaceCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/std-namespace-modification.html +class StdNamespaceModificationCheck : public ClangTidyCheck { public: - DontModifyStdNamespaceCheck(StringRef Name, ClangTidyContext *Context) + StdNamespaceModificationCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -29,6 +29,6 @@ class DontModifyStdNamespaceCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index d5179770008e5..15592d5c03c06 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -19,6 +19,7 @@ #include "../bugprone/SignedCharMisuseCheck.h" #include "../bugprone/SizeofExpressionCheck.h" #include "../bugprone/SpuriouslyWakeUpFunctionsCheck.h" +#include "../bugprone/StdNamespaceModificationCheck.h" #include "../bugprone/SuspiciousMemoryComparisonCheck.h" #include "../bugprone/ThrowingStaticInitializationCheck.h" #include "../bugprone/UncheckedStringToNumberConversionCheck.h" @@ -36,7 +37,6 @@ #include "../performance/MoveConstructorInitCheck.h" #include "../readability/EnumInitialValueCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" -#include "DontModifyStdNamespaceCheck.h" #include "FloatLoopCounter.h" #include "LimitedRandomnessCheck.h" #include "MutatingCopyCheck.h" @@ -251,7 +251,8 @@ class CERTModule : public ClangTidyModule { "cert-dcl51-cpp"); CheckFactories.registerCheck( "cert-dcl54-cpp"); - CheckFactories.registerCheck("cert-dcl58-cpp"); + CheckFactories.registerCheck( + "cert-dcl58-cpp"); CheckFactories.registerCheck( "cert-dcl59-cpp"); // ERR diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt index db3b2f5a08286..27cf392ce0bb1 100644 --- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt @@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyCERTModule STATIC CERTTidyModule.cpp - DontModifyStdNamespaceCheck.cpp FloatLoopCounter.cpp LimitedRandomnessCheck.cpp MutatingCopyCheck.cpp diff --git a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp index 16febeca70809..b557066d979f5 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp @@ -79,7 +79,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" -#include "clang/AST/Type.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -116,7 +116,8 @@ std::string removePureVirtualSyntax(const std::string &MethodDecl, DeclString += Tk.text(); if (Tk.Kind != tok::l_paren && Next.Kind != tok::comma && - Next.Kind != 
tok::r_paren && Next.Kind != tok::l_paren) + Next.Kind != tok::r_paren && Next.Kind != tok::l_paren && + Tk.Kind != tok::coloncolon && Next.Kind != tok::coloncolon) DeclString += ' '; } // Trim the last whitespace. diff --git a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp index b7dcbee1650ec..72095ab2f5982 100644 --- a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp +++ b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp @@ -715,6 +715,45 @@ class D : public B { EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; } +TEST_F(OverridePureVirtualsTests, QualifiedNames) { + constexpr auto Before = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class D : public B { +public: + foo::S foo(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } + + foo::bar::S2 bar(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `bar` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/docs/Maintainers.rst b/clang-tools-extra/docs/Maintainers.rst new file mode 100644 index 0000000000000..f78e9ecf279a6 --- /dev/null +++ b/clang-tools-extra/docs/Maintainers.rst @@ -0,0 +1 @@ +.. include:: ../Maintainers.rst \ No newline at end of file diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 35a64395f04e9..88eb0ebff4501 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -244,6 +244,11 @@ New check aliases ` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-dcl58-cpp ` to + :doc:`bugprone-std-namespace-modification + ` + keeping initial check as an alias to the new one. + - Renamed :doc:`cert-env33-c ` to :doc:`bugprone-command-processor ` @@ -379,7 +384,8 @@ Changes in existing checks ` check to avoid false positives when pointers is transferred to non-const references and avoid false positives of function pointer and fix false - positives on return of non-const pointer. + positives on return of non-const pointer and fix false positives on + pointer-to-member operator. - Improved :doc:`misc-header-include-cycle ` check performance. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst new file mode 100644 index 0000000000000..c6e5608280264 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst @@ -0,0 +1,63 @@ +.. title:: clang-tidy - bugprone-std-namespace-modification + +bugprone-std-namespace-modification +=================================== + +Warns on modifications of the ``std`` or ``posix`` namespaces which can +result in undefined behavior. 
+ +The ``std`` (or ``posix``) namespace is allowed to be extended with (class or +function) template specializations that depend on an user-defined type (a type +that is not defined in the standard system headers). + +The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: + +- Anything that is not a template specialization. +- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. +- Explicit specializations of any member function of a standard library class template. +- Explicit specializations of any member function template of a standard library class or class template. +- Explicit or partial specialization of any member class template of a standard library class or class template. + +Examples: + +.. code-block:: c++ + + namespace std { + int x; // warning: modification of 'std' namespace can result in undefined behavior [bugprone-dont-modify-std-namespace] + } + + namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior + } + + template <> + struct ::std::hash { // warning: modification of 'std' namespace can result in undefined behavior + unsigned long operator()(const long &K) const { + return K; + } + }; + + struct MyData { long data; }; + + template <> + struct ::std::hash { // no warning: specialization with user-defined type + unsigned long operator()(const MyData &K) const { + return K.data; + } + }; + + namespace std { + template <> + void swap(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior + + template <> + bool less::operator()(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior + return true; + } + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`DCL58-CPP. Do not modify the standard namespaces +`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst index fbcc6281a8898..1b8c2c4f97dde 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst @@ -3,57 +3,9 @@ cert-dcl58-cpp ============== -Modification of the ``std`` or ``posix`` namespace can result in undefined -behavior. -This check warns for such modifications. -The ``std`` (or ``posix``) namespace is allowed to be extended with (class or -function) template specializations that depend on an user-defined type (a type -that is not defined in the standard system headers). - -The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: - -- Anything that is not a template specialization. -- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. -- Explicit specializations of any member function of a standard library class template. -- Explicit specializations of any member function template of a standard library class or class template. -- Explicit or partial specialization of any member class template of a standard library class or class template. - -Examples: - -.. 
code-block:: c++ - - namespace std { - int x; // warning: modification of 'std' namespace can result in undefined behavior [cert-dcl58-cpp] - } - - namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior - } - - template <> - struct ::std::hash { // warning: modification of 'std' namespace can result in undefined behavior - unsigned long operator()(const long &K) const { - return K; - } - }; - - struct MyData { long data; }; - - template <> - struct ::std::hash { // no warning: specialization with user-defined type - unsigned long operator()(const MyData &K) const { - return K.data; - } - }; - - namespace std { - template <> - void swap(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior - - template <> - bool less::operator()(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior - return true; - } - } +The `cert-dcl58-cpp` is an aliaes, please see +`bugprone-std-namespace-modification <../bugprone/std-namespace-modification.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `DCL58-CPP. Do not modify the standard namespaces diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index e14ac715cfeeb..5094bcc90caf3 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -141,6 +141,7 @@ Clang-Tidy Checks :doc:`bugprone-sizeof-expression `, :doc:`bugprone-spuriously-wake-up-functions `, :doc:`bugprone-standalone-empty `, "Yes" + :doc:`bugprone-std-namespace-modification `, :doc:`bugprone-string-constructor `, "Yes" :doc:`bugprone-string-integer-assignment `, "Yes" :doc:`bugprone-string-literal-with-embedded-nul `, @@ -175,7 +176,6 @@ Clang-Tidy Checks :doc:`bugprone-unused-return-value `, :doc:`bugprone-use-after-move `, :doc:`bugprone-virtual-near-miss `, "Yes" - :doc:`cert-dcl58-cpp `, :doc:`cert-err33-c `, :doc:`cert-err60-cpp `, :doc:`cert-flp30-c `, @@ -441,6 +441,7 @@ Check aliases :doc:`cert-dcl50-cpp `, :doc:`modernize-avoid-variadic-functions `, :doc:`cert-dcl51-cpp `, :doc:`bugprone-reserved-identifier `, "Yes" :doc:`cert-dcl54-cpp `, :doc:`misc-new-delete-overloads `, + :doc:`cert-dcl58-cpp `, :doc:`bugprone-std-namespace-modification `, :doc:`cert-dcl59-cpp `, :doc:`google-build-namespaces `, :doc:`cert-env33-c `, :doc:`bugprone-command-processor `, :doc:`cert-err09-cpp `, :doc:`misc-throw-by-value-catch-by-reference `, diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst index ec9ef1c60913c..6c994a48d83de 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst @@ -14,3 +14,21 @@ should be generally avoided. // becomes static std::string Moo = (Twine("bark") + "bah").str(); + +The ``Twine`` does not own the memory of its contents, so it is not +recommended to use ``Twine`` created from temporary strings or string literals. + +.. code-block:: c++ + + static Twine getModuleIdentifier(StringRef moduleName) { + return moduleName + "_module"; + } + void foo() { + Twine result = getModuleIdentifier(std::string{"abc"} + "def"); + // temporary std::string is destroyed here, result is dangling + } + +After applying this fix-it hints, the code will use ``std::string`` instead of +``Twine`` for local variables. 
However, ``Twine`` has lots of methods that +are incompatible with ``std::string``, so the user may need to adjust the code +manually after applying the fix-it hints. diff --git a/clang-tools-extra/docs/index.rst b/clang-tools-extra/docs/index.rst index 3f3a99d1b70c6..eba4a2cdbc558 100644 --- a/clang-tools-extra/docs/index.rst +++ b/clang-tools-extra/docs/index.rst @@ -22,6 +22,7 @@ Contents pp-trace clangd clang-doc + Maintainers Doxygen Documentation diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h index b6977cd9ce6c6..0870f60eaa39b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h @@ -59,7 +59,7 @@ struct X {}; } // namespace std // Template specializations that are in a system-header file. -// The purpose is to test cert-dcl58-cpp (no warnings here). +// The purpose is to test bugprone-std-namespace-modification (no warnings here). namespace std { template <> void swap(short &, short &){}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp similarity index 97% rename from clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp index 01964e7dc6c76..32bcbcaa21c0d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++17-or-later %s cert-dcl58-cpp %t -- -- -I %clang_tidy_headers +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-std-namespace-modification %t -- -- -I %clang_tidy_headers #include "system-header-simulation.h" @@ -15,7 +15,7 @@ namespace A { } namespace posix { -// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [cert-dcl58-cpp] +// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [bugprone-std-namespace-modification] // CHECK-MESSAGES: :[[@LINE-2]]:11: note: 'posix' namespace opened here namespace foo { int foobar; diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp index ea9896e1cfde2..fccbd9b3740bd 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp @@ -39,7 +39,6 @@ // CHECK-NEXT: Reason: EnterFile // CHECK-NEXT: FileType: C_User // CHECK-NEXT: PrevFID: (invalid) -// CHECK: - Callback: MacroDefined // CHECK: - Callback: FileChanged // CHECK-NEXT: Loc: ":1:1" // CHECK-NEXT: Reason: ExitFile diff --git a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp index 7c2a231101070..5bd38e0dade28 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp @@ -40,7 +40,6 @@ X // CHECK-NEXT: MacroNameTok: __STDC_EMBED_EMPTY__ // CHECK-NEXT: MacroDirective: MD_Define // CHECK: - Callback: MacroDefined -// CHECK: - Callback: MacroDefined // CHECK-NEXT: MacroNameTok: MACRO // CHECK-NEXT: MacroDirective: MD_Define // CHECK-NEXT: - Callback: MacroExpands diff --git 
a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index baa0bbb5ea631..b7c18c2b5c6cf 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2409,6 +2409,16 @@ those modes. Use ``__has_feature(c_fixed_enum)`` to determine whether support for fixed underlying types is available in C23 and later. +Enumerations with no enumerators +-------------------------------- + +Clang provides support for Microsoft extensions to support enumerations with no enumerators. + +.. code-block:: c++ + + typedef enum empty { } A; + + Interoperability with C++11 lambdas ----------------------------------- diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst index e747022b9c173..6c62bcb5f8c29 100644 --- a/clang/docs/LibClang.rst +++ b/clang/docs/LibClang.rst @@ -38,6 +38,7 @@ Code example .. code-block:: cpp + // main.cpp #include #include @@ -57,6 +58,22 @@ Code example CXCursor cursor = clang_getTranslationUnitCursor(unit); //Obtain a cursor at the root of the translation unit } +.. code-block:: cmake + + # CMakeLists.txt + cmake_minimum_required(VERSION 3.20) + project(my_clang_tool VERSION 0.1.0) + + # This will find the default system installation of Clang; if you want to + # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang + # to the CMake configure command, where /foobar is the build directory where + # you built Clang. + find_package(Clang CONFIG REQUIRED) + + add_executable(my_clang_tool main.cpp) + target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS}) + target_link_libraries(my_clang_tool PRIVATE libclang) + Visiting elements of an AST ~~~~~~~~~~~~~~~~~~~~~~~~~~~ The elements of an AST can be recursively visited with pre-order traversal with ``clang_visitChildren``. @@ -283,6 +300,7 @@ Complete example code .. code-block:: cpp + // main.cpp #include #include @@ -356,6 +374,21 @@ Complete example code ); } +.. code-block:: cmake + + # CMakeLists.txt + cmake_minimum_required(VERSION 3.20) + project(my_clang_tool VERSION 0.1.0) + + # This will find the default system installation of Clang; if you want to + # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang + # to the CMake configure command, where /foobar is the build directory where + # you built Clang. + find_package(Clang CONFIG REQUIRED) + + add_executable(my_clang_tool main.cpp) + target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS}) + target_link_libraries(my_clang_tool PRIVATE libclang) .. _Index.h: https://github.com/llvm/llvm-project/blob/main/clang/include/clang-c/Index.h diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6959e61cac980..32f669f8d70d8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -209,6 +209,8 @@ C23 Feature Support `WG14 N2710 `_. - Fixed accepting as compatible unnamed tag types with the same fields within the same translation unit but from different types. +- ``-MG`` now silences the "file not found" errors with ``#embed`` when + scanning for dependencies and encountering an unknown file. #GH165632 Non-comprehensive list of changes in this release ------------------------------------------------- @@ -355,7 +357,7 @@ Improvements to Clang's diagnostics potential misaligned members get processed before they can get discarded. (#GH144729) -- Clang now emits dignostic with correct message in case of assigning to const reference captured in lambda. 
(#GH105647) +- Clang now emits a diagnostic with the correct message in case of assigning to const reference captured in lambda. (#GH105647) - Fixed false positive in ``-Wmissing-noreturn`` diagnostic when it was requiring the usage of ``[[noreturn]]`` on lambdas before C++23 (#GH154493). @@ -395,6 +397,11 @@ Improvements to Clang's diagnostics that were previously incorrectly accepted in case of other irrelevant conditions are now consistently diagnosed, identical to C++ mode. +- Fix false-positive unused label diagnostic when a label is used in a named break + or continue (#GH166013) +- Clang now emits a diagnostic in case `vector_size` or `ext_vector_type` + attributes are used with a negative size (#GH165463). + Improvements to Clang's time-trace ---------------------------------- @@ -437,6 +444,7 @@ Bug Fixes in This Version - Fixed a failed assertion with empty filename arguments in ``__has_embed``. (#GH159898) - Fixed a failed assertion with empty filename in ``#embed`` directive. (#GH162951) - Fixed a crash triggered by unterminated ``__has_embed``. (#GH162953) +- Accept empty enumerations in MSVC-compatible C mode. (#GH114402) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -457,6 +465,7 @@ Bug Fixes to Attribute Support - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075) - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905) - Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function. +- Fixed several false positives and false negatives in function effect (`nonblocking`) analysis. (#GH166078) (#GH166101) (#GH166110) Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -471,7 +480,7 @@ Bug Fixes to C++ Support casts that are guaranteed to fail (#GH137518). - Fix bug rejecting partial specialization of variable templates with auto NTTPs (#GH118190). - Fix a crash if errors "member of anonymous [...] redeclares" and - "intializing multiple members of union" coincide (#GH149985). + "initializing multiple members of union" coincide (#GH149985). - Fix a crash when using ``explicit(bool)`` in pre-C++11 language modes. (#GH152729) - Fix the parsing of variadic member functions when the ellipis immediately follows a default argument.(#GH153445) - Fixed a bug that caused ``this`` captured by value in a lambda with a dependent explicit object parameter to not be @@ -501,6 +510,7 @@ Bug Fixes to C++ Support nontrivial member when another member has an initializer. (#GH81774) - Fixed a template depth issue when parsing lambdas inside a type constraint. (#GH162092) - Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753) +- Fix a crash when extracting unavailable member type from alias in template deduction. 
(#GH165560) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/APNumericStorage.h b/clang/include/clang/AST/APNumericStorage.h index e1948a552bf7e..04424086b98cf 100644 --- a/clang/include/clang/AST/APNumericStorage.h +++ b/clang/include/clang/AST/APNumericStorage.h @@ -41,9 +41,8 @@ class APNumericStorage { llvm::APInt getIntValue() const { unsigned NumWords = llvm::APInt::getNumWords(BitWidth); if (NumWords > 1) - return llvm::APInt(BitWidth, NumWords, pVal); - else - return llvm::APInt(BitWidth, VAL); + return llvm::APInt(BitWidth, llvm::ArrayRef(pVal, NumWords)); + return llvm::APInt(BitWidth, VAL); } void setIntValue(const ASTContext &C, const llvm::APInt &Val); }; diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h index 0d187eb49d6ca..064a342aa0684 100644 --- a/clang/include/clang/AST/AbstractBasicReader.h +++ b/clang/include/clang/AST/AbstractBasicReader.h @@ -173,7 +173,7 @@ class DataStreamBasicReader : public BasicReaderBase { llvm::SmallVector data; for (uint32_t i = 0; i != numWords; ++i) data.push_back(asImpl().readUInt64()); - return llvm::APInt(bitWidth, numWords, &data[0]); + return llvm::APInt(bitWidth, data); } llvm::FixedPointSemantics readFixedPointSemantics() { diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 91ffbb169f947..eb532bc8be3a7 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -23,7 +23,11 @@ #include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h" #include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/AnalysisDeclContext.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" +#include namespace clang::lifetimes { @@ -44,10 +48,6 @@ class LifetimeSafetyReporter { Confidence Confidence) {} }; -/// The main entry point for the analysis. -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, - LifetimeSafetyReporter *Reporter); - namespace internal { /// An object to hold the factories for immutable collections, ensuring /// that all created states share the same underlying memory management. @@ -60,6 +60,7 @@ struct LifetimeFactory { /// Running the lifetime safety analysis and querying its results. It /// encapsulates the various dataflow analyses. class LifetimeSafetyAnalysis { + public: LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter); @@ -82,6 +83,12 @@ class LifetimeSafetyAnalysis { std::unique_ptr LoanPropagation; }; } // namespace internal + +/// The main entry point for the analysis. 
+std::unique_ptr +runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter); + } // namespace clang::lifetimes #endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index ba138b078b379..26686a63e9204 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -16,7 +16,10 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" #include "clang/Analysis/Analyses/LifetimeSafety/Utils.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" namespace clang::lifetimes::internal { @@ -76,6 +79,8 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; + const llvm::StringMap getMissingOrigins() const; + private: OriginID getNextOriginID() { return NextOriginID++; } @@ -85,6 +90,7 @@ class OriginManager { llvm::SmallVector AllOrigins; llvm::DenseMap DeclToOriginID; llvm::DenseMap ExprToOriginID; + llvm::StringMap ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 2b400b012d6ed..0275447e1090a 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5235,6 +5235,12 @@ def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate { let Prototype = "T(unsigned int, T)"; } +def HLSLF16ToF32 : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_f16tof32"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + // Builtins for XRay. 
def XRayCustomEvent : Builtin { let Spellings = ["__xray_customevent"]; diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 9e877b92eac68..4388c09423a21 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1765,75 +1765,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; + def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">; + def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; + def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">; + def vpermi2varpd256 : 
X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { + def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; + def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">; + def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">; } -let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">; } diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def index a768b12fa4e0d..ea3636ffa1af1 100644 --- a/clang/include/clang/Basic/DebugOptions.def +++ b/clang/include/clang/Basic/DebugOptions.def @@ -46,6 +46,8 @@ ENUM_DEBUGOPT(EmitDwarfUnwind, EmitDwarfUnwindType, 2, DEBUGOPT(NoDwarfDirectoryAsm , 1, 0, Benign) ///< Set when -fno-dwarf-directory-asm ///< is enabled. +DEBUGOPT(Dwarf2CFIAsm, 1, 0, NotCompatible) ///< Set when -fdwarf2-cfi-asm is enabled. + DEBUGOPT(NoInlineLineTables, 1, 0, Benign) ///< Whether debug info should contain ///< inline line tables. 
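As a side note on the ``BuiltinsX86.td`` change above: marking the ``vpermi2var*`` builtins ``Constexpr`` lets two-source permutes participate in constant evaluation. A minimal sketch of what that enables (assuming AVX-512VL is enabled for the translation unit and that vector subscripting in constant expressions behaves as in Clang's existing constexpr builtin tests):

.. code-block:: c++

  typedef int v4si __attribute__((vector_size(16)));

  // For each destination lane, index bits [1:0] select the element and bit 2
  // selects the source vector (0 = a, 1 = b), matching the evaluator added
  // for __builtin_ia32_vpermi2vard128 in this change.
  constexpr v4si a   = {10, 11, 12, 13};
  constexpr v4si idx = { 0,  4,  1,  5};
  constexpr v4si b   = {20, 21, 22, 23};
  constexpr v4si r   = __builtin_ia32_vpermi2vard128(a, idx, b);
  static_assert(r[0] == 10 && r[1] == 20 && r[2] == 11 && r[3] == 21, "");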
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 8aa3489a2a62b..1e0321de3f4b6 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1436,6 +1436,7 @@ def MicrosoftDrectveSection : DiagGroup<"microsoft-drectve-section">; def MicrosoftInclude : DiagGroup<"microsoft-include">; def MicrosoftCppMacro : DiagGroup<"microsoft-cpp-macro">; def MicrosoftFixedEnum : DiagGroup<"microsoft-fixed-enum">; +def MicrosoftEmptyEnum : DiagGroup<"microsoft-empty-enum">; def MicrosoftSealed : DiagGroup<"microsoft-sealed">; def MicrosoftAbstract : DiagGroup<"microsoft-abstract">; def MicrosoftUnqualifiedFriend : DiagGroup<"microsoft-unqualified-friend">; @@ -1489,7 +1490,8 @@ def Microsoft : DiagGroup<"microsoft", MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag, MicrosoftCommentPaste, MicrosoftEndOfFile, MicrosoftInitFromPredefined, MicrosoftStringLiteralFromPredefined, - MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction]>; + MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction, + MicrosoftEmptyEnum]>; def ClangClPch : DiagGroup<"clang-cl-pch">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index e5e071f43fa75..aa0ccb0c05101 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -116,6 +116,9 @@ def err_enumerator_unnamed_no_def : Error< def ext_ms_c_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a Microsoft extension">, InGroup; +def ext_ms_c_empty_enum_type : Extension< + "empty enumeration types are a Microsoft extension">, + InGroup; def ext_c23_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a C23 extension">, InGroup; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 4e369be0bbb92..fa509536bf021 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3510,6 +3510,8 @@ def err_init_method_bad_return_type : Error< "init methods must return an object pointer type, not %0">; def err_attribute_invalid_size : Error< "vector size not an integral multiple of component size">; +def err_attribute_vec_negative_size + : Error<"vector must have non-negative size">; def err_attribute_zero_size : Error<"zero %0 size">; def err_attribute_size_too_large : Error<"%0 size too large">; def err_typecheck_sve_rvv_ambiguous : Error< diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index ed967fd47dc83..6d9d074d78026 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -1286,16 +1286,7 @@ class SourceManager : public RefCountedBase { /// If the location is an expansion record, walk through it until we find /// the final location expanded. FileIDAndOffset getDecomposedExpansionLoc(SourceLocation Loc) const { - FileID FID = getFileID(Loc); - auto *E = getSLocEntryOrNull(FID); - if (!E) - return std::make_pair(FileID(), 0); - - unsigned Offset = Loc.getOffset()-E->getOffset(); - if (Loc.isFileID()) - return std::make_pair(FID, Offset); - - return getDecomposedExpansionLocSlowCase(E); + return getDecomposedLoc(getExpansionLoc(Loc)); } /// Decompose the specified location into a raw FileID + Offset pair. 
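Relatedly, the ``err_attribute_vec_negative_size`` diagnostic added to ``DiagnosticSemaKinds.td`` above gives negative vector sizes a dedicated error. A small sketch of the kind of user code it now rejects (hypothetical snippet; the message text is taken from the diagnostic definition):

.. code-block:: c++

  // error: vector must have non-negative size
  typedef int bad_vec __attribute__((vector_size(-16)));

  // error: vector must have non-negative size
  typedef float bad_ext_vec __attribute__((ext_vector_type(-4)));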
@@ -1303,15 +1294,7 @@ class SourceManager : public RefCountedBase { /// If the location is an expansion record, walk through it until we find /// its spelling record. FileIDAndOffset getDecomposedSpellingLoc(SourceLocation Loc) const { - FileID FID = getFileID(Loc); - auto *E = getSLocEntryOrNull(FID); - if (!E) - return std::make_pair(FileID(), 0); - - unsigned Offset = Loc.getOffset()-E->getOffset(); - if (Loc.isFileID()) - return std::make_pair(FID, Offset); - return getDecomposedSpellingLocSlowCase(E, Offset); + return getDecomposedLoc(getSpellingLoc(Loc)); } /// Returns the "included/expanded in" decomposed location of the given @@ -1979,10 +1962,6 @@ class SourceManager : public RefCountedBase { SourceLocation getSpellingLocSlowCase(SourceLocation Loc) const; SourceLocation getFileLocSlowCase(SourceLocation Loc) const; - FileIDAndOffset - getDecomposedExpansionLocSlowCase(const SrcMgr::SLocEntry *E) const; - FileIDAndOffset getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E, - unsigned Offset) const; void computeMacroArgsCache(MacroArgsMap &MacroArgsCache, FileID FID) const; void associateFileChunkWithMacroArgExp(MacroArgsMap &MacroArgsCache, FileID FID, diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td index 89e644a078682..0371279aafc08 100644 --- a/clang/include/clang/Basic/riscv_sifive_vector.td +++ b/clang/include/clang/Basic/riscv_sifive_vector.td @@ -121,6 +121,13 @@ multiclass RVVVQMACCQOQBuiltinSet> suffixes_prototypes> { defm NAME : RVVOutOp1Op2BuiltinSet; } +multiclass RVVVFEXPBuiltinSet> suffixes_prototypes, string type_range> { + let UnMaskedPolicyScheme = HasPassthruOperand, + OverloadedName = NAME, + Log2LMUL = [-2, -1, 0, 1, 2, 3] in + defm NAME : RVVOutBuiltinSet; +} + multiclass RVVVFNRCLIPBuiltinSet { let Log2LMUL = [-3, -2, -1, 0, 1, 2], Name = NAME, @@ -145,6 +152,26 @@ let UnMaskedPolicyScheme = HasPolicyOperand in defm sf_vqmaccsu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>; } +let RequiredFeatures = ["xsfvfbfexp16e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "y">; +} + +let RequiredFeatures = ["xsfvfexp16e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "x">; +} + +let RequiredFeatures = ["xsfvfexp32e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "f">; +} + +let RequiredFeatures = ["xsfvfexpa"] in { + defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "xf">; +} + +let RequiredFeatures = ["xsfvfexpa64e"] in { + defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "d">; +} + let UnMaskedPolicyScheme = HasPolicyOperand in let RequiredFeatures = ["xsfvfwmaccqqq"] in defm sf_vfwmacc_4x4x4 : RVVVFWMACCBuiltinSet<[["", "Fw", "FwFwSvv"]]>; diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 2b361ed0982c6..6f9a69e697cc3 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4171,6 +4171,16 @@ def CIR_ATanOp : CIR_UnaryFPToFPBuiltinOp<"atan", "ATanOp"> { }]; } +def CIR_CeilOp : CIR_UnaryFPToFPBuiltinOp<"ceil", "FCeilOp"> { + let summary = "Computes the ceiling of the specified value"; + let description = [{ + `cir.ceil` computes the ceiling of a given value and returns a result + of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. 
+ }]; +} + def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> { let summary = "Computes the floating-point cosine value"; let description = [{ @@ -4181,6 +4191,16 @@ def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> { }]; } +def CIR_ExpOp : CIR_UnaryFPToFPBuiltinOp<"exp", "ExpOp"> { + let summary = "Computes the floating-point base-e exponential value"; + let description = [{ + `cir.exp` computes the exponential of a floating-point operand and returns + a result of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. + }]; +} + def CIR_FAbsOp : CIR_UnaryFPToFPBuiltinOp<"fabs", "FAbsOp"> { let summary = "Computes the floating-point absolute value"; let description = [{ diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 48ef8be9fb782..6f099a7027a10 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -180,6 +180,8 @@ struct MissingFeatures { static bool atomicSyncScopeID() { return false; } static bool atomicTypes() { return false; } static bool atomicUseLibCall() { return false; } + static bool atomicMicrosoftVolatile() { return false; } + static bool atomicOpenMP() { return false; } // Global ctor handling static bool globalCtorLexOrder() { return false; } diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h index f1b8229edd362..4298ba06c472e 100644 --- a/clang/include/clang/CodeGen/ModuleBuilder.h +++ b/clang/include/clang/CodeGen/ModuleBuilder.h @@ -120,6 +120,23 @@ CodeGenerator *CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::LLVMContext &C, CoverageSourceInfo *CoverageInfo = nullptr); +namespace CodeGen { +/// Demangle the artificial function name (\param FuncName) used to encode trap +/// reasons used in debug info for traps (e.g. __builtin_verbose_trap). See +/// `CGDebugInfo::CreateTrapFailureMessageFor`. +/// +/// \param FuncName - The function name to demangle. +/// +/// \return A std::optional. If demangling succeeds the optional will contain +/// a pair of StringRefs where the first field is the trap category and the +/// second is the trap message. These can both be empty. If demangling fails the +/// optional will not contain a value. Note the returned StringRefs if non-empty +/// point into the underlying storage for \param FuncName and thus have the same +/// lifetime. +std::optional> +DemangleTrapReasonInDebugInfo(StringRef FuncName); +} // namespace CodeGen + } // end namespace clang #endif diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 20955ef1b852e..11e81e032d5fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -741,6 +741,7 @@ def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>, "Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. 
" "Flang will use the GCC installation with the largest version">; def gcc_triple_EQ : Joined<["--"], "gcc-triple=">, + Visibility<[ClangOption, FlangOption]>, HelpText<"Search for the GCC installation with the specified triple.">; def CC : Flag<["-"], "CC">, Visibility<[ClangOption, CC1Option]>, Group, @@ -950,9 +951,9 @@ def Xarch__ the host system, which can be used to suppress incompatible GPU arguments.}]>, MetaVarName<" ">; def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[NoXarchOption]>, - HelpText<"Pass to the CUDA/HIP host compilation">, MetaVarName<"">; + HelpText<"Pass to host compilation in the offloading toolchain">, MetaVarName<"">; def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[NoXarchOption]>, - HelpText<"Pass to the CUDA/HIP device compilation">, MetaVarName<"">; + HelpText<"Pass to device compilation in the offloading toolchain">, MetaVarName<"">; def Xassembler : Separate<["-"], "Xassembler">, HelpText<"Pass to the assembler">, MetaVarName<"">, Group; @@ -2154,8 +2155,12 @@ defm dollars_in_identifiers : BoolFOption<"dollars-in-identifiers", PosFlag, NegFlag, BothFlags<[], [ClangOption, CC1Option], " '$' in identifiers">>; -def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group; -def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group; + +defm dwarf2_cfi_asm + : BoolFOption<"dwarf2-cfi-asm", CodeGenOpts<"Dwarf2CFIAsm">, DefaultFalse, + PosFlag, + NegFlag>; + defm dwarf_directory_asm : BoolFOption<"dwarf-directory-asm", CodeGenOpts<"NoDwarfDirectoryAsm">, DefaultFalse, NegFlag, diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index 4103c3f006a8f..604039ef61cb7 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -14,7 +14,10 @@ #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H #include "clang/AST/Decl.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" +#include "clang/Analysis/AnalysisDeclContext.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" #include namespace clang { @@ -95,6 +98,9 @@ class AnalysisBasedWarnings { /// a single function. unsigned MaxUninitAnalysisBlockVisitsPerFunction; + /// Map from expressions missing origin in OriginManager to their counts. 
+ llvm::StringMap MissingOriginCount; + /// @} public: @@ -116,6 +122,9 @@ class AnalysisBasedWarnings { Policy &getPolicyOverrides() { return PolicyOverrides; } void PrintStats() const; + + void FindMissingOrigins(AnalysisDeclContext &AC, + clang::lifetimes::internal::FactManager &FactMgr); }; } // namespace sema diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index fd0903f2e652c..638028f84ff24 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -436,8 +436,28 @@ LLVM_DUMP_METHOD void Descriptor::dumpFull(unsigned Offset, FO += ElemDesc->getAllocSize(); } + } else if (isPrimitiveArray()) { + OS.indent(Spaces) << "Elements: " << getNumElems() << '\n'; + OS.indent(Spaces) << "Element type: " << primTypeToString(getPrimType()) + << '\n'; + unsigned FO = Offset + sizeof(InitMapPtr); + for (unsigned I = 0; I != getNumElems(); ++I) { + OS.indent(Spaces) << "Element " << I << " offset: " << FO << '\n'; + FO += getElemSize(); + } } else if (isRecord()) { ElemRecord->dump(OS, Indent + 1, Offset); + unsigned I = 0; + for (const Record::Field &F : ElemRecord->fields()) { + OS.indent(Spaces) << "- Field " << I << ": "; + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true}); + OS << F.Decl->getName(); + } + OS << ". Offset " << (Offset + F.Offset) << "\n"; + F.Desc->dumpFull(Offset + F.Offset, Indent + 1); + ++I; + } } else if (isPrimitive()) { } else { } diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h index 659892e720abf..cc918dc12deb6 100644 --- a/clang/lib/AST/ByteCode/Floating.h +++ b/clang/lib/AST/ByteCode/Floating.h @@ -45,7 +45,8 @@ class Floating final { if (singleWord()) return APFloat(getSemantics(), APInt(BitWidth, Val)); unsigned NumWords = numWords(); - return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory)); + return APFloat(getSemantics(), + APInt(BitWidth, llvm::ArrayRef(Memory, NumWords))); } public: diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h index 6683db941c736..b11e6eea28e3f 100644 --- a/clang/lib/AST/ByteCode/IntegralAP.h +++ b/clang/lib/AST/ByteCode/IntegralAP.h @@ -63,7 +63,7 @@ template class IntegralAP final { if (singleWord()) return APInt(BitWidth, Val, Signed); unsigned NumWords = llvm::APInt::getNumWords(BitWidth); - return llvm::APInt(BitWidth, NumWords, Memory); + return llvm::APInt(BitWidth, llvm::ArrayRef(Memory, NumWords)); } public: diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8b57b963c538f..9991e365addb8 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3415,18 +3415,46 @@ static bool interp__builtin_ia32_shuffle_generic( GetSourceIndex) { assert(Call->getNumArgs() == 3); - unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); + + unsigned ShuffleMask = 0; + Pointer A, MaskVector, B; + + QualType Arg2Type = Call->getArg(2)->getType(); + bool IsVectorMask = false; + if (Arg2Type->isVectorType()) { + IsVectorMask = true; + B = S.Stk.pop(); + MaskVector = S.Stk.pop(); + A = S.Stk.pop(); + } else if (Arg2Type->isIntegerType()) { + ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); + B = S.Stk.pop(); + A = S.Stk.pop(); + } else { + return false; + } QualType Arg0Type = Call->getArg(0)->getType(); const auto *VecT = Arg0Type->castAs(); PrimType ElemT = *S.getContext().classify(VecT->getElementType()); unsigned NumElems = VecT->getNumElements(); - const Pointer 
&B = S.Stk.pop(); - const Pointer &A = S.Stk.pop(); const Pointer &Dst = S.Stk.peek(); + PrimType MaskElemT = PT_Uint32; + if (IsVectorMask) { + QualType Arg1Type = Call->getArg(1)->getType(); + const auto *MaskVecT = Arg1Type->castAs(); + QualType MaskElemType = MaskVecT->getElementType(); + MaskElemT = *S.getContext().classify(MaskElemType); + } + for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) { + if (IsVectorMask) { + INT_TYPE_SWITCH(MaskElemT, { + ShuffleMask = static_cast(MaskVector.elem(DstIdx)); + }); + } auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); if (SrcIdx < 0) { @@ -4434,6 +4462,60 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return std::pair{0, static_cast(DstIdx)}; } }); + case X86::BI__builtin_ia32_vpermi2varq128: + case X86::BI__builtin_ia32_vpermi2varpd128: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1; + unsigned SrcIdx = (ShuffleMask >> 1) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2vard128: + case X86::BI__builtin_ia32_vpermi2varps128: + case X86::BI__builtin_ia32_vpermi2varq256: + case X86::BI__builtin_ia32_vpermi2varpd256: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3; + unsigned SrcIdx = (ShuffleMask >> 2) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2varhi128: + case X86::BI__builtin_ia32_vpermi2vard256: + case X86::BI__builtin_ia32_vpermi2varps256: + case X86::BI__builtin_ia32_vpermi2varq512: + case X86::BI__builtin_ia32_vpermi2varpd512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x7; + unsigned SrcIdx = (ShuffleMask >> 3) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi128: + case X86::BI__builtin_ia32_vpermi2varhi256: + case X86::BI__builtin_ia32_vpermi2vard512: + case X86::BI__builtin_ia32_vpermi2varps512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0xF; + unsigned SrcIdx = (ShuffleMask >> 4) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi256: + case X86::BI__builtin_ia32_vpermi2varhi512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1F; + unsigned SrcIdx = (ShuffleMask >> 5) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3F; + unsigned SrcIdx = (ShuffleMask >> 6) & 0x1; + return std::pair{SrcIdx, Offset}; + }); case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index e0b2852f0e906..2425373ab2ef8 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -218,21 +218,42 @@ UnsignedOrNone Program::createGlobal(const ValueDecl *VD, const Expr *Init) { return std::nullopt; Global *NewGlobal = Globals[*Idx]; + // Note that this loop has one iteration where Redecl == VD. 
for (const Decl *Redecl : VD->redecls()) { - unsigned &PIdx = GlobalIndices[Redecl]; + + // If this redecl was registered as a dummy variable, it is now a proper + // global variable and points to the block we just created. + if (auto DummyIt = DummyVariables.find(Redecl); + DummyIt != DummyVariables.end()) { + assert(!Globals[DummyIt->second]->block()->hasPointers()); + Globals[DummyIt->second] = NewGlobal; + DummyVariables.erase(DummyIt); + } + // If the redeclaration hasn't been registered yet at all, we just set its + // global index to Idx. If it has been registered yet, it might have + // pointers pointing to it and we need to transfer those pointers to the new + // block. + auto [Iter, Inserted] = GlobalIndices.try_emplace(Redecl); + if (Inserted) { + GlobalIndices[Redecl] = *Idx; + continue; + } + if (Redecl != VD) { - if (Block *RedeclBlock = Globals[PIdx]->block(); + if (Block *RedeclBlock = Globals[Iter->second]->block(); RedeclBlock->isExtern()) { - Globals[PIdx] = NewGlobal; + // All pointers pointing to the previous extern decl now point to the // new decl. // A previous iteration might've already fixed up the pointers for this // global. if (RedeclBlock != NewGlobal->block()) RedeclBlock->movePointersTo(NewGlobal->block()); + + Globals[Iter->second] = NewGlobal; } } - PIdx = *Idx; + Iter->second = *Idx; } return *Idx; diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index 28fcc97f5339d..cc9127dc77860 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -205,7 +205,6 @@ class Program final { const Block *block() const { return &B; } private: - /// Required metadata - does not actually track pointers. Block B; }; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 97eeba8b9d6cc..8fab6efafb983 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11628,21 +11628,38 @@ static bool evalShuffleGeneric( if (!VT) return false; - APSInt MaskImm; - if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) - return false; - unsigned ShuffleMask = static_cast(MaskImm.getZExtValue()); + unsigned ShuffleMask = 0; + APValue A, MaskVector, B; + bool IsVectorMask = false; - APValue A, B; - if (!EvaluateAsRValue(Info, Call->getArg(0), A) || - !EvaluateAsRValue(Info, Call->getArg(1), B)) + QualType Arg2Type = Call->getArg(2)->getType(); + if (Arg2Type->isVectorType()) { + IsVectorMask = true; + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) || + !EvaluateAsRValue(Info, Call->getArg(2), B)) + return false; + } else if (Arg2Type->isIntegerType()) { + APSInt MaskImm; + if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) + return false; + ShuffleMask = static_cast(MaskImm.getZExtValue()); + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), B)) + return false; + } else { return false; + } unsigned NumElts = VT->getNumElements(); - SmallVector ResultElements; + SmallVector ResultElements; ResultElements.reserve(NumElts); for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) { + if (IsVectorMask) { + ShuffleMask = static_cast( + MaskVector.getVectorElt(DstIdx).getInt().getZExtValue()); + } auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); if (SrcIdx < 0) { @@ -13080,6 +13097,84 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_vpermi2varq128: 
+ case X86::BI__builtin_ia32_vpermi2varpd128: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1; + unsigned SrcIdx = (ShuffleMask >> 1) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2vard128: + case X86::BI__builtin_ia32_vpermi2varps128: + case X86::BI__builtin_ia32_vpermi2varq256: + case X86::BI__builtin_ia32_vpermi2varpd256: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3; + unsigned SrcIdx = (ShuffleMask >> 2) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varhi128: + case X86::BI__builtin_ia32_vpermi2vard256: + case X86::BI__builtin_ia32_vpermi2varps256: + case X86::BI__builtin_ia32_vpermi2varq512: + case X86::BI__builtin_ia32_vpermi2varpd512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x7; + unsigned SrcIdx = (ShuffleMask >> 3) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi128: + case X86::BI__builtin_ia32_vpermi2varhi256: + case X86::BI__builtin_ia32_vpermi2vard512: + case X86::BI__builtin_ia32_vpermi2varps512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0xF; + unsigned SrcIdx = (ShuffleMask >> 4) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi256: + case X86::BI__builtin_ia32_vpermi2varhi512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1F; + unsigned SrcIdx = (ShuffleMask >> 5) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3F; + unsigned SrcIdx = (ShuffleMask >> 6) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } } } diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 75b17c545bb78..54c30c05c3e19 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -746,11 +746,14 @@ ExprMutationAnalyzer::Analyzer::findPointeeMemberMutation(const Expr *Exp) { Stm, Context)); if (MemberCallExpr) return MemberCallExpr; - const auto Matches = - match(stmt(forEachDescendant( - memberExpr(hasObjectExpression(canResolveToExprPointee(Exp))) - .bind(NodeID::value))), - Stm, Context); + const auto Matches = match( + stmt(forEachDescendant( + expr(anyOf(memberExpr( + hasObjectExpression(canResolveToExprPointee(Exp))), + binaryOperator(hasOperatorName("->*"), + hasLHS(canResolveToExprPointee(Exp))))) + .bind(NodeID::value))), + Stm, Context); return findExprMutation(Matches); } diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 00c7ed90503e7..d183ce976f946 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -23,9 +23,11 @@ #include 
"clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/raw_ostream.h" #include namespace clang::lifetimes { @@ -69,9 +71,12 @@ void LifetimeSafetyAnalysis::run() { } } // namespace internal -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, - LifetimeSafetyReporter *Reporter) { - internal::LifetimeSafetyAnalysis Analysis(AC, Reporter); - Analysis.run(); +std::unique_ptr +runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter) { + std::unique_ptr Analysis = + std::make_unique(AC, Reporter); + Analysis->run(); + return Analysis; } } // namespace clang::lifetimes diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index ea51a75324e06..b8f705e3377c2 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,9 @@ //===----------------------------------------------------------------------===// #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" +#include "llvm/ADT/StringMap.h" namespace clang::lifetimes::internal { @@ -22,6 +25,10 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } +const llvm::StringMap OriginManager::getMissingOrigins() const { + return ExprTypeToMissingOriginCount; +} + Origin &OriginManager::addOrigin(OriginID ID, const clang::ValueDecl &D) { AllOrigins.emplace_back(ID, &D); return AllOrigins.back(); @@ -37,6 +44,18 @@ OriginID OriginManager::get(const Expr &E) { auto It = ExprToOriginID.find(&E); if (It != ExprToOriginID.end()) return It->second; + + // if the expression has no specific origin, increment the missing origin + // counter. + // const QualType ExprType = E.getType(); + std::string ExprStr(E.getStmtClassName()); + ExprStr = ExprStr + "<" + E.getType().getAsString() + ">"; + auto CountIt = ExprTypeToMissingOriginCount.find(ExprStr); + if (CountIt == ExprTypeToMissingOriginCount.end()) { + ExprTypeToMissingOriginCount[ExprStr] = 1; + } else { + CountIt->second++; + } // If the expression itself has no specific origin, and it's a reference // to a declaration, its origin is that of the declaration it refers to. 
// For pointer types, where we don't pre-emptively create an origin for the diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 938c6485125ee..7dc81c50f87a2 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -907,58 +907,27 @@ getExpansionLocSlowCase(SourceLocation Loc) const { SourceLocation SourceManager::getSpellingLocSlowCase(SourceLocation Loc) const { do { - FileIDAndOffset LocInfo = getDecomposedLoc(Loc); - Loc = getSLocEntry(LocInfo.first).getExpansion().getSpellingLoc(); - Loc = Loc.getLocWithOffset(LocInfo.second); + const SLocEntry &Entry = getSLocEntry(getFileID(Loc)); + Loc = Entry.getExpansion().getSpellingLoc().getLocWithOffset( + Loc.getOffset() - Entry.getOffset()); } while (!Loc.isFileID()); return Loc; } SourceLocation SourceManager::getFileLocSlowCase(SourceLocation Loc) const { do { - if (isMacroArgExpansion(Loc)) - Loc = getImmediateSpellingLoc(Loc); - else - Loc = getImmediateExpansionRange(Loc).getBegin(); + const SLocEntry &Entry = getSLocEntry(getFileID(Loc)); + const ExpansionInfo &ExpInfo = Entry.getExpansion(); + if (ExpInfo.isMacroArgExpansion()) { + Loc = ExpInfo.getSpellingLoc().getLocWithOffset(Loc.getOffset() - + Entry.getOffset()); + } else { + Loc = ExpInfo.getExpansionLocStart(); + } } while (!Loc.isFileID()); return Loc; } -FileIDAndOffset SourceManager::getDecomposedExpansionLocSlowCase( - const SrcMgr::SLocEntry *E) const { - // If this is an expansion record, walk through all the expansion points. - FileID FID; - SourceLocation Loc; - unsigned Offset; - do { - Loc = E->getExpansion().getExpansionLocStart(); - - FID = getFileID(Loc); - E = &getSLocEntry(FID); - Offset = Loc.getOffset()-E->getOffset(); - } while (!Loc.isFileID()); - - return std::make_pair(FID, Offset); -} - -FileIDAndOffset -SourceManager::getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E, - unsigned Offset) const { - // If this is an expansion record, walk through all the expansion points. - FileID FID; - SourceLocation Loc; - do { - Loc = E->getExpansion().getSpellingLoc(); - Loc = Loc.getLocWithOffset(Offset); - - FID = getFileID(Loc); - E = &getSLocEntry(FID); - Offset = Loc.getOffset()-E->getOffset(); - } while (!Loc.isFileID()); - - return std::make_pair(FID, Offset); -} - /// getImmediateSpellingLoc - Given a SourceLocation object, return the /// spelling location referenced by the ID. 
This is the first level down
/// towards the place where the characters that make up the lexed token can be
diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
index 7db6e283ec0a5..cd4c1f0e5b769 100644
--- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
@@ -27,6 +27,7 @@ class AtomicInfo {
   CharUnits atomicAlign;
   CharUnits valueAlign;
   TypeEvaluationKind evaluationKind = cir::TEK_Scalar;
+  bool useLibCall = true;
   LValue lvalue;
   mlir::Location loc;
 
@@ -62,8 +63,8 @@ class AtomicInfo {
       assert(!cir::MissingFeatures::atomicInfo());
       cgf.cgm.errorNYI(loc, "AtomicInfo: non-simple lvalue");
     }
-
-    assert(!cir::MissingFeatures::atomicUseLibCall());
+    useLibCall = !ctx.getTargetInfo().hasBuiltinAtomic(
+        atomicSizeInBits, ctx.toBits(lvalue.getAlignment()));
   }
 
   QualType getValueType() const { return valueTy; }
@@ -75,6 +76,8 @@ class AtomicInfo {
     assert(!cir::MissingFeatures::atomicInfoGetAtomicPointer());
     return nullptr;
   }
+  bool shouldUseLibCall() const { return useLibCall; }
+  const LValue &getAtomicLValue() const { return lvalue; }
   Address getAtomicAddress() const {
     mlir::Type elemTy;
     if (lvalue.isSimple()) {
@@ -96,6 +99,8 @@ class AtomicInfo {
 
   bool emitMemSetZeroIfNecessary() const;
 
+  mlir::Value getScalarRValValueOrNull(RValue rvalue) const;
+
   /// Cast the given pointer to an integer pointer suitable for atomic
   /// operations on the source.
   Address castToAtomicIntPointer(Address addr) const;
@@ -105,6 +110,9 @@ class AtomicInfo {
   /// copy the value across.
   Address convertToAtomicIntPointer(Address addr) const;
 
+  /// Converts an rvalue to an integer value.
+  mlir::Value convertRValueToInt(RValue rvalue, bool cmpxchg = false) const;
+
   /// Copy an atomic r-value into atomic-layout memory.
   void emitCopyIntoMemory(RValue rvalue) const;
 
@@ -195,6 +203,12 @@ Address AtomicInfo::createTempAlloca() const {
   return tempAlloca;
 }
 
+mlir::Value AtomicInfo::getScalarRValValueOrNull(RValue rvalue) const {
+  if (rvalue.isScalar() && (!hasPadding() || !lvalue.isSimple()))
+    return rvalue.getValue();
+  return nullptr;
+}
+
 Address AtomicInfo::castToAtomicIntPointer(Address addr) const {
   auto intTy = mlir::dyn_cast(addr.getElementType());
   // Don't bother with int casts if the integer size is the same.
@@ -211,10 +225,38 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const {
     return false;
 
   cgf.cgm.errorNYI(loc,
-                   "AtomicInfo::emitMemSetZeroIfNecaessary: emit memset zero");
+                   "AtomicInfo::emitMemSetZeroIfNecessary: emit memset zero");
   return false;
 }
 
+/// Return true if \param valueTy is a type that should be casted to integer
+/// around the atomic memory operation. If \param cmpxchg is true, then the
+/// cast of a floating point type is made, as that instruction cannot have
+/// floating point operands. TODO: Allow compare-and-exchange and FP - see
+/// comment in CIRGenAtomicExpandPass.cpp.
+static bool shouldCastToInt(mlir::Type valueTy, bool cmpxchg) {
+  if (cir::isAnyFloatingPointType(valueTy))
+    return isa(valueTy) || cmpxchg;
+  return !isa(valueTy) && !isa(valueTy);
+}
+
+mlir::Value AtomicInfo::convertRValueToInt(RValue rvalue, bool cmpxchg) const {
+  // If we've got a scalar value of the right size, try to avoid going
+  // through memory. Floats get casted if needed by AtomicExpandPass.
+ if (mlir::Value value = getScalarRValValueOrNull(rvalue)) { + if (!shouldCastToInt(value.getType(), cmpxchg)) + return cgf.emitToMemory(value, valueTy); + + cgf.cgm.errorNYI( + loc, "AtomicInfo::convertRValueToInt: cast scalar rvalue to int"); + return nullptr; + } + + cgf.cgm.errorNYI( + loc, "AtomicInfo::convertRValueToInt: cast non-scalar rvalue to int"); + return nullptr; +} + /// Copy an r-value into memory as part of storing to an atomic type. /// This needs to create a bit-pattern suitable for atomic operations. void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const { @@ -815,6 +857,79 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { e->getExprLoc()); } +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, bool isInit) { + bool isVolatile = dest.isVolatileQualified(); + auto order = cir::MemOrder::SequentiallyConsistent; + if (!dest.getType()->isAtomicType()) { + assert(!cir::MissingFeatures::atomicMicrosoftVolatile()); + } + return emitAtomicStore(rvalue, dest, order, isVolatile, isInit); +} + +/// Emit a store to an l-value of atomic type. +/// +/// Note that the r-value is expected to be an r-value of the atomic type; this +/// means that for aggregate r-values, it should include storage for any padding +/// that was necessary. +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, + cir::MemOrder order, bool isVolatile, + bool isInit) { + // If this is an aggregate r-value, it should agree in type except + // maybe for address-space qualification. + mlir::Location loc = dest.getPointer().getLoc(); + assert(!rvalue.isAggregate() || + rvalue.getAggregateAddress().getElementType() == + dest.getAddress().getElementType()); + + AtomicInfo atomics(*this, dest, loc); + LValue lvalue = atomics.getAtomicLValue(); + + if (lvalue.isSimple()) { + // If this is an initialization, just put the value there normally. + if (isInit) { + atomics.emitCopyIntoMemory(rvalue); + return; + } + + // Check whether we should use a library call. + if (atomics.shouldUseLibCall()) { + assert(!cir::MissingFeatures::atomicUseLibCall()); + cgm.errorNYI(loc, "emitAtomicStore: atomic store with library call"); + return; + } + + // Okay, we're doing this natively. + mlir::Value valueToStore = atomics.convertRValueToInt(rvalue); + + // Do the atomic store. + Address addr = atomics.getAtomicAddress(); + if (mlir::Value value = atomics.getScalarRValValueOrNull(rvalue)) { + if (shouldCastToInt(value.getType(), /*CmpXchg=*/false)) { + addr = atomics.castToAtomicIntPointer(addr); + valueToStore = + builder.createIntCast(valueToStore, addr.getElementType()); + } + } + cir::StoreOp store = builder.createStore(loc, valueToStore, addr); + + // Initializations don't need to be atomic. + if (!isInit) { + assert(!cir::MissingFeatures::atomicOpenMP()); + store.setMemOrder(order); + } + + // Other decoration. 
+ if (isVolatile) + store.setIsVolatile(true); + + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + return; + } + + cgm.errorNYI(loc, "emitAtomicStore: non-simple atomic lvalue"); + assert(!cir::MissingFeatures::opLoadStoreAtomic()); +} + void CIRGenFunction::emitAtomicInit(Expr *init, LValue dest) { AtomicInfo atomics(*this, dest, getLoc(init->getSourceRange())); diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index e35100ffe4b6b..0803910f2e83a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -211,6 +211,28 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, assert(!cir::MissingFeatures::fastMathFlags()); return emitUnaryMaybeConstrainedFPBuiltin(*this, *e); + case Builtin::BIceil: + case Builtin::BIceilf: + case Builtin::BIceill: + case Builtin::BI__builtin_ceil: + case Builtin::BI__builtin_ceilf: + case Builtin::BI__builtin_ceilf16: + case Builtin::BI__builtin_ceill: + case Builtin::BI__builtin_ceilf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin(*this, *e); + + case Builtin::BIexp: + case Builtin::BIexpf: + case Builtin::BIexpl: + case Builtin::BI__builtin_exp: + case Builtin::BI__builtin_expf: + case Builtin::BI__builtin_expf16: + case Builtin::BI__builtin_expl: + case Builtin::BI__builtin_expf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin(*this, *e); + case Builtin::BIfabs: case Builtin::BIfabsf: case Builtin::BIfabsl: diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index 851328a7db680..437db306f3369 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -147,8 +147,8 @@ void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) { assert(!cir::MissingFeatures::innermostEHScope()); - EHCleanupScope *scope = new (buffer) - EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup); + EHCleanupScope *scope = new (buffer) EHCleanupScope( + size, branchFixups.size(), innermostNormalCleanup, innermostEHScope); if (isNormalCleanup) innermostNormalCleanup = stable_begin(); @@ -191,7 +191,9 @@ void EHScopeStack::popCleanup() { EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) { char *buffer = allocate(EHCatchScope::getSizeForNumHandlers(numHandlers)); assert(!cir::MissingFeatures::innermostEHScope()); - EHCatchScope *scope = new (buffer) EHCatchScope(numHandlers); + EHCatchScope *scope = + new (buffer) EHCatchScope(numHandlers, innermostEHScope); + innermostEHScope = stable_begin(); return scope; } diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 61a09a59b05c0..a035d792ef6d1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -30,6 +30,8 @@ struct CatchTypeInfo { /// A protected scope for zero-cost EH handling. 
class EHScope { + EHScopeStack::stable_iterator enclosingEHScope; + class CommonBitFields { friend class EHScope; unsigned kind : 3; @@ -79,7 +81,10 @@ class EHScope { public: enum Kind { Cleanup, Catch, Terminate, Filter }; - EHScope(Kind kind) { commonBits.kind = kind; } + EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope) + : enclosingEHScope(enclosingEHScope) { + commonBits.kind = kind; + } Kind getKind() const { return static_cast(commonBits.kind); } @@ -90,6 +95,10 @@ class EHScope { assert(!cir::MissingFeatures::ehstackBranches()); return false; } + + EHScopeStack::stable_iterator getEnclosingEHScope() const { + return enclosingEHScope; + } }; /// A scope which attempts to handle some, possibly all, types of @@ -111,6 +120,8 @@ class EHCatchScope : public EHScope { /// The catch handler for this type. mlir::Region *region; + + bool isCatchAll() const { return type.rtti == nullptr; } }; private: @@ -118,12 +129,18 @@ class EHCatchScope : public EHScope { Handler *getHandlers() { return reinterpret_cast(this + 1); } + const Handler *getHandlers() const { + return reinterpret_cast(this + 1); + } + public: static size_t getSizeForNumHandlers(unsigned n) { return sizeof(EHCatchScope) + n * sizeof(Handler); } - EHCatchScope(unsigned numHandlers) : EHScope(Catch) { + EHCatchScope(unsigned numHandlers, + EHScopeStack::stable_iterator enclosingEHScope) + : EHScope(Catch, enclosingEHScope) { catchBits.numHandlers = numHandlers; assert(catchBits.numHandlers == numHandlers && "NumHandlers overflow?"); } @@ -136,6 +153,11 @@ class EHCatchScope : public EHScope { getHandlers()[i].region = region; } + const Handler &getHandler(unsigned i) const { + assert(i < getNumHandlers()); + return getHandlers()[i]; + } + // Clear all handler blocks. // FIXME: it's better to always call clearHandlerBlocks in DTOR and have a // 'takeHandler' or some such function which removes ownership from the @@ -144,6 +166,10 @@ class EHCatchScope : public EHScope { // The blocks are owned by TryOp, nothing to delete. } + using iterator = const Handler *; + iterator begin() const { return getHandlers(); } + iterator end() const { return getHandlers() + getNumHandlers(); } + static bool classof(const EHScope *scope) { return scope->getKind() == Catch; } @@ -176,9 +202,10 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope } EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth, - EHScopeStack::stable_iterator enclosingNormal) - : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal), - fixupDepth(fixupDepth) { + EHScopeStack::stable_iterator enclosingNormal, + EHScopeStack::stable_iterator enclosingEH) + : EHScope(EHScope::Cleanup, enclosingEH), + enclosingNormal(enclosingNormal), fixupDepth(fixupDepth) { // TODO(cir): When exception handling is upstreamed, isNormalCleanup and // isEHCleanup will be arguments to the constructor. 
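The catch scope keeps its handlers directly behind the scope object itself, which is why the allocation size is computed with `getSizeForNumHandlers` and the accessor reinterpret-casts `this + 1`. A small self-contained sketch of that "header followed by a trailing array" layout, with simplified types and made-up names, assuming only the alignment `malloc` already guarantees:

```cpp
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <new>

struct Handler { int typeId; };

struct CatchScope {
  unsigned numHandlers;
  explicit CatchScope(unsigned n) : numHandlers(n) {}

  static std::size_t sizeForNumHandlers(unsigned n) {
    return sizeof(CatchScope) + n * sizeof(Handler);
  }
  // The handler array lives immediately behind the scope object.
  Handler *handlers() { return reinterpret_cast<Handler *>(this + 1); }
};

int main() {
  unsigned n = 3;
  void *buffer = std::malloc(CatchScope::sizeForNumHandlers(n));
  auto *scope = new (buffer) CatchScope(n);
  for (unsigned i = 0; i < n; ++i)
    new (scope->handlers() + i) Handler{static_cast<int>(i + 100)};

  for (unsigned i = 0; i < scope->numHandlers; ++i)
    std::cout << scope->handlers()[i].typeId << ' ';
  std::cout << '\n';  // 100 101 102

  std::free(buffer);  // Handler and CatchScope are trivially destructible
}
```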
cleanupBits.isNormalCleanup = true; @@ -235,13 +262,45 @@ class EHScopeStack::iterator { EHScope *get() const { return reinterpret_cast(ptr); } + EHScope *operator->() const { return get(); } EHScope &operator*() const { return *get(); } + + iterator &operator++() { + size_t size; + switch (get()->getKind()) { + case EHScope::Catch: + size = EHCatchScope::getSizeForNumHandlers( + static_cast(get())->getNumHandlers()); + break; + + case EHScope::Filter: + llvm_unreachable("EHScopeStack::iterator Filter"); + break; + + case EHScope::Cleanup: + llvm_unreachable("EHScopeStack::iterator Cleanup"); + break; + + case EHScope::Terminate: + llvm_unreachable("EHScopeStack::iterator Terminate"); + break; + } + ptr += llvm::alignTo(size, ScopeStackAlignment); + return *this; + } + + bool operator==(iterator other) const { return ptr == other.ptr; } + bool operator!=(iterator other) const { return ptr != other.ptr; } }; inline EHScopeStack::iterator EHScopeStack::begin() const { return iterator(startOfData); } +inline EHScopeStack::iterator EHScopeStack::end() const { + return iterator(endOfBuffer); +} + inline EHScopeStack::iterator EHScopeStack::find(stable_iterator savePoint) const { assert(savePoint.isValid() && "finding invalid savepoint"); @@ -254,7 +313,7 @@ inline void EHScopeStack::popCatch() { assert(!empty() && "popping exception stack when not empty"); EHCatchScope &scope = llvm::cast(*begin()); - assert(!cir::MissingFeatures::innermostEHScope()); + innermostEHScope = scope.getEnclosingEHScope(); deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers())); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 5ccb431e626ae..422fa1cf5ad2e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -311,7 +311,8 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, QualType ty, - bool isInit, bool isNontemporal) { + LValueBaseInfo baseInfo, bool isInit, + bool isNontemporal) { assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); if (const auto *clangVecTy = ty->getAs()) { @@ -333,7 +334,13 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, value = emitToMemory(value, ty); - assert(!cir::MissingFeatures::opLoadStoreAtomic()); + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + LValue atomicLValue = LValue::makeAddr(addr, ty, baseInfo); + if (ty->isAtomicType() || + (!isInit && isLValueSuitableForInlineAtomic(atomicLValue))) { + emitAtomicStore(RValue::get(value), atomicLValue, isInit); + return; + } // Update the alloca with more info on initialization. 
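The `innermostEHScope` bookkeeping above works like an intrusive singly linked chain threaded through the scope stack: each pushed scope records the previously innermost scope, and `popCatch` restores it. A toy model of that invariant, using indices into a `std::vector` where the real code uses stable iterators into a byte buffer (class and member names here are invented for the sketch):

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

class ScopeStack {
public:
  static constexpr std::size_t npos = static_cast<std::size_t>(-1);

  std::size_t push() {
    scopes.push_back({innermostEHScope});  // remember the previous innermost
    innermostEHScope = scopes.size() - 1;
    return innermostEHScope;
  }

  void pop() {
    assert(!scopes.empty());
    innermostEHScope = scopes.back().enclosingEHScope;  // restore it
    scopes.pop_back();
  }

  std::size_t innermost() const { return innermostEHScope; }

private:
  struct Scope { std::size_t enclosingEHScope; };
  std::vector<Scope> scopes;
  std::size_t innermostEHScope = npos;
};

int main() {
  ScopeStack stack;
  stack.push();                                          // scope 0
  stack.push();                                          // scope 1
  std::cout << stack.innermost() << '\n';                // 1
  stack.pop();
  std::cout << stack.innermost() << '\n';                // 0
  stack.pop();
  std::cout << (stack.innermost() == ScopeStack::npos) << '\n';  // 1 (empty)
}
```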
assert(addr.getPointer() && "expected pointer to exist"); @@ -550,7 +557,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue, } emitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(), - lvalue.getType(), isInit, /*isNontemporal=*/false); + lvalue.getType(), lvalue.getBaseInfo(), isInit, + /*isNontemporal=*/false); } mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile, @@ -1630,7 +1638,7 @@ RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot, bool ignoreResult) { switch (CIRGenFunction::getEvaluationKind(e->getType())) { case cir::TEK_Scalar: - return RValue::get(emitScalarExpr(e)); + return RValue::get(emitScalarExpr(e, ignoreResult)); case cir::TEK_Complex: return RValue::getComplex(emitComplexExpr(e)); case cir::TEK_Aggregate: { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 3d3030ca87e2a..201fb73983155 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -343,8 +343,8 @@ class AggExprEmitter : public StmtVisitor { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitNoInitExpr"); } void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { - cgf.cgm.errorNYI(dae->getSourceRange(), - "AggExprEmitter: VisitCXXDefaultArgExpr"); + CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); + Visit(dae->getExpr()); } void VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 119314fe27dce..5eba5ba6c3df1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -78,11 +78,15 @@ struct BinOpInfo { class ScalarExprEmitter : public StmtVisitor { CIRGenFunction &cgf; CIRGenBuilderTy &builder; + // Unlike classic codegen we set this to false or use std::exchange to read + // the value instead of calling TestAndClearIgnoreResultAssign to make it + // explicit when the value is used bool ignoreResultAssign; public: - ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder) - : cgf(cgf), builder(builder) {} + ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder, + bool ignoreResultAssign = false) + : cgf(cgf), builder(builder), ignoreResultAssign(ignoreResultAssign) {} //===--------------------------------------------------------------------===// // Utilities @@ -221,6 +225,8 @@ class ScalarExprEmitter : public StmtVisitor { } mlir::Value VisitArraySubscriptExpr(ArraySubscriptExpr *e) { + ignoreResultAssign = false; + if (e->getBase()->getType()->isVectorType()) { assert(!cir::MissingFeatures::scalableVectors()); @@ -839,6 +845,7 @@ class ScalarExprEmitter : public StmtVisitor { BinOpInfo emitBinOps(const BinaryOperator *e, QualType promotionType = QualType()) { + ignoreResultAssign = false; BinOpInfo result; result.lhs = cgf.emitPromotedScalarExpr(e->getLHS(), promotionType); result.rhs = cgf.emitPromotedScalarExpr(e->getRHS(), promotionType); @@ -924,6 +931,7 @@ class ScalarExprEmitter : public StmtVisitor { #undef HANDLEBINOP mlir::Value emitCmp(const BinaryOperator *e) { + ignoreResultAssign = false; const mlir::Location loc = cgf.getLoc(e->getExprLoc()); mlir::Value result; QualType lhsTy = e->getLHS()->getType(); @@ -1406,11 +1414,13 @@ CIRGenFunction::emitCompoundAssignmentLValue(const CompoundAssignOperator *e) { } /// Emit the computation of the specified expression of 
scalar type. -mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e) { +mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e, + bool ignoreResultAssign) { assert(e && hasScalarEvaluationKind(e->getType()) && "Invalid scalar expression to emit"); - return ScalarExprEmitter(*this, builder).Visit(const_cast(e)); + return ScalarExprEmitter(*this, builder, ignoreResultAssign) + .Visit(const_cast(e)); } mlir::Value CIRGenFunction::emitPromotedScalarExpr(const Expr *e, @@ -2054,6 +2064,11 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) { mlir::Value ScalarExprEmitter::VisitInitListExpr(InitListExpr *e) { const unsigned numInitElements = e->getNumInits(); + [[maybe_unused]] const bool ignore = std::exchange(ignoreResultAssign, false); + assert((ignore == false || + (numInitElements == 0 && e->getType()->isVoidType())) && + "init list ignored"); + if (e->hadArrayRangeDesignator()) { cgf.cgm.errorNYI(e->getSourceRange(), "ArrayRangeDesignator"); return {}; diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index e5cecaa573a6e..1c52a78d72e33 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1271,6 +1271,9 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitAtomicExpr(AtomicExpr *e); void emitAtomicInit(Expr *init, LValue dest); + void emitAtomicStore(RValue rvalue, LValue dest, bool isInit); + void emitAtomicStore(RValue rvalue, LValue dest, cir::MemOrder order, + bool isVolatile, bool isInit); AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d, mlir::OpBuilder::InsertPoint ip = {}); @@ -1501,7 +1504,8 @@ class CIRGenFunction : public CIRGenTypeCache { llvm::ArrayRef args = {}); /// Emit the computation of the specified expression of scalar type. - mlir::Value emitScalarExpr(const clang::Expr *e); + mlir::Value emitScalarExpr(const clang::Expr *e, + bool ignoreResultAssign = false); mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, cir::UnaryOpKind kind, bool isPre); @@ -1679,8 +1683,8 @@ class CIRGenFunction : public CIRGenTypeCache { bool isInit); void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, - clang::QualType ty, bool isInit = false, - bool isNontemporal = false); + clang::QualType ty, LValueBaseInfo baseInfo, + bool isInit = false, bool isNontemporal = false); void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit); /// Store the specified rvalue into the specified diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 4198c23c9cbed..9005b0106b2a4 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -155,6 +155,9 @@ class EHScopeStack { /// The innermost normal cleanup on the stack. stable_iterator innermostNormalCleanup = stable_end(); + /// The innermost EH scope on the stack. + stable_iterator innermostEHScope = stable_end(); + /// The CGF this Stack belong to CIRGenFunction *cgf = nullptr; @@ -226,6 +229,8 @@ class EHScopeStack { } stable_iterator getInnermostActiveNormalCleanup() const; + stable_iterator getInnermostEHScope() const { return innermostEHScope; } + /// An unstable reference to a scope-stack depth. Invalidated by /// pushes but not pops. class iterator; @@ -233,6 +238,9 @@ class EHScopeStack { /// Returns an iterator pointing to the innermost EH scope. iterator begin() const; + /// Returns an iterator pointing to the outermost EH scope. 
+ iterator end() const; + /// Create a stable reference to the top of the EH stack. The /// returned reference is valid until that scope is popped off the /// stack. diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 5a6193fa8d840..ba967a43ce59a 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -194,6 +194,14 @@ mlir::LogicalResult CIRToLLVMCosOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMExpOpLowering::matchAndRewrite( + cir::ExpOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resTy = typeConverter->convertType(op.getType()); + rewriter.replaceOpWithNewOp(op, resTy, adaptor.getSrc()); + return mlir::success(); +} + static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter, mlir::Value llvmSrc, mlir::Type llvmDstIntTy, bool isUnsigned, uint64_t cirSrcWidth, @@ -1336,6 +1344,14 @@ mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMCeilOpLowering::matchAndRewrite( + cir::CeilOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resTy = typeConverter->convertType(op.getType()); + rewriter.replaceOpWithNewOp(op, resTy, adaptor.getSrc()); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite( cir::AllocaOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index fbf4a5722caed..b6928ce7d9c44 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -160,6 +160,57 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) { return LastInst; } +static Value *handleElementwiseF16ToF32(CodeGenFunction &CGF, + const CallExpr *E) { + Value *Op0 = CGF.EmitScalarExpr(E->getArg(0)); + QualType Op0Ty = E->getArg(0)->getType(); + llvm::Type *ResType = CGF.FloatTy; + uint64_t NumElements = 0; + if (Op0->getType()->isVectorTy()) { + NumElements = + E->getArg(0)->getType()->castAs()->getNumElements(); + ResType = + llvm::VectorType::get(ResType, ElementCount::getFixed(NumElements)); + } + if (!Op0Ty->hasUnsignedIntegerRepresentation()) + llvm_unreachable( + "f16tof32 operand must have an unsigned int representation"); + + if (CGF.CGM.getTriple().isDXIL()) + return CGF.Builder.CreateIntrinsic(ResType, Intrinsic::dx_legacyf16tof32, + ArrayRef{Op0}, nullptr, + "hlsl.f16tof32"); + + if (CGF.CGM.getTriple().isSPIRV()) { + // We use the SPIRV UnpackHalf2x16 operation to avoid the need for the + // Int16 and Float16 capabilities + auto UnpackType = + llvm::VectorType::get(CGF.FloatTy, ElementCount::getFixed(2)); + if (NumElements == 0) { + // a scalar input - simply extract the first element of the unpacked + // vector + Value *Unpack = CGF.Builder.CreateIntrinsic( + UnpackType, Intrinsic::spv_unpackhalf2x16, ArrayRef{Op0}); + return CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0); + } else { + // a vector input - build a congruent output vector by iterating through + // the input vector calling unpackhalf2x16 for each element + Value *Result = PoisonValue::get(ResType); + for (uint64_t i = 0; i < NumElements; i++) { + Value *InVal = CGF.Builder.CreateExtractElement(Op0, i); + Value *Unpack = CGF.Builder.CreateIntrinsic( + UnpackType, 
Intrinsic::spv_unpackhalf2x16, + ArrayRef{InVal}); + Value *Res = CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0); + Result = CGF.Builder.CreateInsertElement(Result, Res, i); + } + return Result; + } + } + + llvm_unreachable("Intrinsic F16ToF32 not supported by target architecture"); +} + static Value *emitBufferStride(CodeGenFunction *CGF, const Expr *HandleExpr, LValue &Stride) { // Figure out the stride of the buffer elements from the handle type. @@ -579,6 +630,9 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getDegreesIntrinsic(), ArrayRef{X}, nullptr, "hlsl.degrees"); } + case Builtin::BI__builtin_hlsl_elementwise_f16tof32: { + return handleElementwiseF16ToF32(*this, E); + } case Builtin::BI__builtin_hlsl_elementwise_frac: { Value *Op0 = EmitScalarExpr(E->getArg(0)); if (!E->getArg(0)->getType()->hasFloatingRepresentation()) diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp index 96f3f6221e20f..8ec8aef311656 100644 --- a/clang/lib/CodeGen/ModuleBuilder.cpp +++ b/clang/lib/CodeGen/ModuleBuilder.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/VirtualFileSystem.h" #include @@ -378,3 +379,31 @@ clang::CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::StringRef ModuleName, HeaderSearchOpts, PreprocessorOpts, CGO, C, CoverageInfo); } + +namespace clang { +namespace CodeGen { +std::optional> +DemangleTrapReasonInDebugInfo(StringRef FuncName) { + static auto TrapRegex = + llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); + llvm::SmallVector Matches; + std::string *ErrorPtr = nullptr; +#ifndef NDEBUG + std::string Error; + ErrorPtr = &Error; +#endif + if (!TrapRegex.match(FuncName, &Matches, ErrorPtr)) { + assert(ErrorPtr && ErrorPtr->empty() && "Invalid regex pattern"); + return {}; + } + + if (Matches.size() != 3) { + assert(0 && "Expected 3 matches from Regex::match"); + return {}; + } + + // Returns { Trap Category, Trap Message } + return std::make_pair(Matches[1], Matches[2]); +} +} // namespace CodeGen +} // namespace clang diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 51618d17a4180..a0b82cec9a372 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -3857,6 +3857,9 @@ class OffloadingActionBuilder final { /// Flag set to true if all valid builders allow file bundling/unbundling. bool CanUseBundler; + /// Flag set to false if an argument turns off bundling. + bool ShouldUseBundler; + public: OffloadingActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) @@ -3891,6 +3894,9 @@ class OffloadingActionBuilder final { } CanUseBundler = ValidBuilders && ValidBuilders == ValidBuildersSupportingBundling; + + ShouldUseBundler = Args.hasFlag(options::OPT_gpu_bundle_output, + options::OPT_no_gpu_bundle_output, true); } ~OffloadingActionBuilder() { @@ -4042,11 +4048,11 @@ class OffloadingActionBuilder final { SB->appendTopLevelActions(OffloadAL); } - // If we can use the bundler, replace the host action by the bundling one in - // the resulting list. Otherwise, just append the device actions. For - // device only compilation, HostAction is a null pointer, therefore only do - // this when HostAction is not a null pointer. 
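The `f16tof32` handling above lowers the builtin to a target intrinsic, but the underlying operation is just reinterpreting an IEEE binary16 bit pattern, carried in the low bits of a 32-bit unsigned value, as a 32-bit float. A standalone, scalar-only sketch of that bit-level conversion (this is not the DXIL or SPIR-V intrinsic itself, just the math it performs):

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>

// Convert one IEEE-754 binary16 bit pattern (in the low 16 bits) to float.
float halfBitsToFloat(std::uint16_t h) {
  std::uint32_t sign = (h >> 15) & 0x1;
  std::uint32_t exp = (h >> 10) & 0x1F;
  std::uint32_t mant = h & 0x3FF;
  float value;
  if (exp == 0)        // zero or subnormal: mant * 2^-24
    value = std::ldexp(static_cast<float>(mant), -24);
  else if (exp == 31)  // infinity or NaN
    value = mant ? std::nanf("") : INFINITY;
  else                 // normal: (1024 + mant) * 2^(exp - 25) == 1.mant * 2^(exp - 15)
    value = std::ldexp(static_cast<float>(mant + 1024), static_cast<int>(exp) - 25);
  return sign ? -value : value;
}

int main() {
  std::cout << halfBitsToFloat(0x3C00) << '\n';  // 1
  std::cout << halfBitsToFloat(0xC000) << '\n';  // -2
  std::cout << halfBitsToFloat(0x3555) << '\n';  // ~0.333
}
```

For the vector case, the same conversion is simply applied lane by lane, which is what the element-wise loop over `unpackhalf2x16` amounts to.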
- if (CanUseBundler && HostAction && + // If we can and should use the bundler, replace the host action by the + // bundling one in the resulting list. Otherwise, just append the device + // actions. For device only compilation, HostAction is a null pointer, + // therefore only do this when HostAction is not a null pointer. + if (CanUseBundler && ShouldUseBundler && HostAction && HostAction->getType() != types::TY_Nothing && !OffloadAL.empty()) { // Add the host action to the list in order to create the bundling action. OffloadAL.push_back(HostAction); @@ -6463,9 +6469,16 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, (JA.getOffloadingDeviceKind() == Action::OFK_OpenMP && TC && TC->getTriple().isAMDGPU())); }; - if (!AtTopLevel && JA.getType() == types::TY_LLVM_BC && - (C.getArgs().hasArg(options::OPT_emit_llvm) || - IsAMDRDCInCompilePhase(JA, C.getArgs()))) + + // The linker wrapper may not support the input and output files to be the + // same one, and without it -save-temps can fail. + bool IsLinkerWrapper = + JA.getType() == types::TY_Object && isa(JA); + bool IsEmitBitcode = JA.getType() == types::TY_LLVM_BC && + (C.getArgs().hasArg(options::OPT_emit_llvm) || + IsAMDRDCInCompilePhase(JA, C.getArgs())); + + if (!AtTopLevel && (IsLinkerWrapper || IsEmitBitcode)) Suffixed += ".tmp"; Suffixed += '.'; Suffixed += Suffix; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index d3ab6f1261ad6..30d3e5293a31b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7879,10 +7879,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, !TC.getTriple().isAndroid() && TC.useIntegratedAs())) CmdArgs.push_back("-faddrsig"); - if ((Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) && + const bool HasDefaultDwarf2CFIASM = + (Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) && (EH || UnwindTables || AsyncUnwindTables || - DebugInfoKind != llvm::codegenoptions::NoDebugInfo)) - CmdArgs.push_back("-D__GCC_HAVE_DWARF2_CFI_ASM=1"); + DebugInfoKind != llvm::codegenoptions::NoDebugInfo); + if (Args.hasFlag(options::OPT_fdwarf2_cfi_asm, + options::OPT_fno_dwarf2_cfi_asm, HasDefaultDwarf2CFIASM)) + CmdArgs.push_back("-fdwarf2-cfi-asm"); if (Arg *A = Args.getLastArg(options::OPT_fsymbol_partition_EQ)) { std::string Str = A->getAsString(Args); diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index f24b8ab14bdce..406c77cb3ae8f 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -591,7 +591,8 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, CurrentChangeWidthRight = CurrentChange.TokenLength; const FormatToken *MatchingParenToEncounter = nullptr; for (unsigned J = I + 1; - J != E && (Changes[J].NewlinesBefore == 0 || MatchingParenToEncounter); + J != E && (Changes[J].NewlinesBefore == 0 || + MatchingParenToEncounter || Changes[J].IsAligned); ++J) { const auto &Change = Changes[J]; const auto *Tok = Change.Tok; diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 15fa7de35df97..93e012b163878 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -75,6 +75,17 @@ struct DepCollectorPPCallbacks : public PPCallbacks { /*IsMissing*/ false); } + bool EmbedFileNotFound(StringRef FileName) override { + DepCollector.maybeAddDependency( + llvm::sys::path::remove_leading_dotslash(FileName), 
+ /*FromModule=*/false, + /*IsSystem=*/false, + /*IsModuleFile=*/false, + /*IsMissing=*/true); + // Return true to silence the file not found diagnostic. + return true; + } + void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index ed3f1f93d25d3..b88d9f89c5f71 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1516,6 +1516,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.PointerAuthIntrinsics) Builder.defineMacro("__PTRAUTH__"); + if (CGOpts.Dwarf2CFIAsm) + Builder.defineMacro("__GCC_HAVE_DWARF2_CFI_ASM"); + // Get other target #defines. TI.getTargetDefines(LangOpts, Builder); } diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index aea3e72d92a84..10032184b5d94 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -349,14 +349,13 @@ struct SourceColumnMap { /// When the source code line we want to print is too long for /// the terminal, select the "interesting" region. -static void selectInterestingSourceRegion(std::string &SourceLine, - std::string &CaretLine, - std::string &FixItInsertionLine, - Columns NonGutterColumns, - const SourceColumnMap &Map) { - Columns CaretColumns = Columns(CaretLine.size()); - Columns FixItColumns = - Columns(llvm::sys::locale::columnWidth(FixItInsertionLine)); +static void selectInterestingSourceRegion( + std::string &SourceLine, std::string &CaretLine, + std::string &FixItInsertionLine, Columns NonGutterColumns, + const SourceColumnMap &Map, + SmallVectorImpl &Styles) { + Columns CaretColumns = CaretLine.size(); + Columns FixItColumns = llvm::sys::locale::columnWidth(FixItInsertionLine); Columns MaxColumns = std::max({Map.columns().V, CaretColumns.V, FixItColumns.V}); // if the number of columns is less than the desired number we're done @@ -369,13 +368,11 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // Find the slice that we need to display the full caret line // correctly. Columns CaretStart = 0, CaretEnd = CaretLine.size(); - for (; CaretStart != CaretEnd; CaretStart = CaretStart.next()) - if (!isWhitespace(CaretLine[CaretStart.V])) - break; + while (CaretStart != CaretEnd && isWhitespace(CaretLine[CaretStart.V])) + CaretStart = CaretStart.next(); - for (; CaretEnd != CaretStart; CaretEnd = CaretEnd.prev()) - if (!isWhitespace(CaretLine[CaretEnd.V - 1])) - break; + while (CaretEnd != CaretStart && isWhitespace(CaretLine[CaretEnd.V])) + CaretEnd = CaretEnd.prev(); // caret has already been inserted into CaretLine so the above whitespace // check is guaranteed to include the caret @@ -516,13 +513,45 @@ static void selectInterestingSourceRegion(std::string &SourceLine, assert(FrontColumnsRemoved + ColumnsKept + BackColumnsRemoved > NonGutterColumns); + // Since we've modified the SourceLine, we also need to adjust the line's + // highlighting information. In particular, if we've removed + // from the front of the line, we need to move the style ranges to the + // left and remove unneeded ranges. + // Note in particular that variables like CaretEnd are defined in the + // CaretLine, which only contains ASCII, while the style ranges are defined in + // the source line, where we have to care for the byte-index != column-index + // case. 
+ Bytes BytesRemoved = + FrontColumnsRemoved > FrontEllipse.size() + ? (Map.columnToByte(FrontColumnsRemoved) - Bytes(FrontEllipse.size())) + : 0; + Bytes CodeEnd = + CaretEnd < Map.columns() ? Map.columnToByte(CaretEnd.V) : CaretEnd.V; + for (TextDiagnostic::StyleRange &R : Styles) { + // Remove style ranges before and after the new truncated snippet. + if (R.Start >= static_cast(CodeEnd.V) || + R.End < static_cast(BytesRemoved.V)) { + R.Start = R.End = std::numeric_limits::max(); + continue; + } + // Move them left. (Note that this can wrap R.Start, but that doesn't + // matter). + R.Start -= BytesRemoved.V; + R.End -= BytesRemoved.V; + + // Don't leak into the ellipse at the end. + if (R.Start < static_cast(CodeEnd.V) && + R.End > static_cast(CodeEnd.V)) + R.End = CodeEnd.V + 1; // R.End is inclusive. + } + // The line needs some truncation, and we'd prefer to keep the front // if possible, so remove the back if (BackColumnsRemoved > Columns(BackEllipse.size())) SourceLine.replace(SourceEnd.V, std::string::npos, BackEllipse); // If that's enough then we're done - if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns)) + if (FrontColumnsRemoved + ColumnsKept <= NonGutterColumns) return; // Otherwise remove the front as well @@ -1391,6 +1420,11 @@ void TextDiagnostic::emitSnippetAndCaret( OS.indent(MaxLineNoDisplayWidth + 2) << "| "; }; + Columns MessageLength = DiagOpts.MessageLength; + // If we don't have enough columns available, just abort now. + if (MessageLength != 0 && MessageLength <= Columns(MaxLineNoDisplayWidth + 4)) + return; + // Prepare source highlighting information for the lines we're about to // emit, starting from the first line. std::unique_ptr[]> SourceStyles = @@ -1450,10 +1484,14 @@ void TextDiagnostic::emitSnippetAndCaret( // If the source line is too long for our terminal, select only the // "interesting" source region within that line. - Columns MessageLength = DiagOpts.MessageLength; - if (MessageLength.V != 0) + if (MessageLength != 0) { + Columns NonGutterColumns = MessageLength; + if (MaxLineNoDisplayWidth != 0) + NonGutterColumns -= Columns(MaxLineNoDisplayWidth + 4); selectInterestingSourceRegion(SourceLine, CaretLine, FixItInsertionLine, - MessageLength, SourceColMap); + NonGutterColumns, SourceColMap, + SourceStyles[LineNo - Lines.first]); + } // If we are in -fdiagnostics-print-source-range-info mode, we are trying // to produce easily machine parsable output. Add a space before the @@ -1508,7 +1546,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine, // Print the source line one character at a time. bool PrintReversed = false; std::optional CurrentColor; - size_t I = 0; + size_t I = 0; // Bytes. 
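The style-range fix-up above boils down to three steps: shift byte-based highlight ranges left by the number of bytes trimmed from the front of the line, drop ranges that fall entirely outside the kept window, and clamp ranges that straddle its end. A simplified standalone version of that adjustment, using half-open ranges rather than the inclusive `R.End` convention of the real code:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

struct Range { std::size_t start, end; };  // half-open [start, end), in bytes

std::vector<Range> adjustRanges(const std::vector<Range> &in,
                                std::size_t bytesRemoved, std::size_t keptEnd) {
  std::vector<Range> out;
  for (Range r : in) {
    if (r.end <= bytesRemoved || r.start >= keptEnd)
      continue;  // entirely before or after the kept window: drop it
    r.start = r.start < bytesRemoved ? 0 : r.start - bytesRemoved;
    r.end = (r.end > keptEnd ? keptEnd : r.end) - bytesRemoved;  // clamp, then shift
    out.push_back(r);
  }
  return out;
}

int main() {
  // Keep bytes [4, 20) of the original line, i.e. 4 leading bytes removed.
  for (Range r : adjustRanges({{0, 3}, {2, 8}, {10, 30}}, 4, 20))
    std::cout << '[' << r.start << ',' << r.end << ") ";
  std::cout << '\n';  // [0,4) [6,16)
}
```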
while (I < SourceLine.size()) { auto [Str, WasPrintable] = printableTextForNextCharacter(SourceLine, &I, DiagOpts.TabStop); diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h index 37ebc4f46a826..46ec12a63ef9c 100644 --- a/clang/lib/Headers/avx10_2_512bf16intrin.h +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1))); __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \ __min_vector_width__(512))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#endif + static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) { return __builtin_bit_cast(__m512bh, _mm512_setzero_ps()); } @@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { (__v32bf)__A); } -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); @@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh( (__v32bf)_mm512_setzero_pbh()); } +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR #undef __DEFAULT_FN_ATTRS512 #endif diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 765cd682986b4..8fb8cd7cd0865 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1))); __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \ __min_vector_width__(128))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) { return __builtin_bit_cast(__m256bh, _mm256_setzero_ps()); } @@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { (__v16bf)__A); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi)__B); } -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); @@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 - +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif #endif diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index ac75b6ccde735..aab1f2b61ab8a 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -969,35 +969,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, 
__m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 18c4a44a4c76e..5fc0afa49ce4c 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -3059,69 +3059,61 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) /* Vector permutations */ -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, (__v16si) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, (__v8di) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi64(__m512i 
__A, __mmask8 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)_mm512_setzero_si512()); @@ -5949,71 +5941,66 @@ _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, (__v8df)__B); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)__A); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, - __m512d __B) -{ + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)(__m512d)__I); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, - __m512d __B) -{ + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, (__v16sf) __B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)__A); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)(__m512)__I); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 
-_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)_mm512_setzero_ps()); } - #define _mm512_cvtt_roundpd_epu32(A, R) \ ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_undefined_si256(), \ diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 142cc079c2c4b..25051228f3e0a 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -3316,13 +3316,13 @@ _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { (__v32hf)__A); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutexvar_ph(__m512i __A, __m512h __B) { return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h index 625a8ff66dc60..f73b607df797f 100644 --- a/clang/lib/Headers/avx512ifmaintrin.h +++ b/clang/lib/Headers/avx512ifmaintrin.h @@ -17,9 +17,8 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ - constexpr \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ - __min_vector_width__(512))) + __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ + __min_vector_width__(512))) constexpr #else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index b377c17166ffb..51d5210e5aa5d 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -18,13 +18,13 @@ /* Define the default attributes for the functions in this file. 
*/ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ @@ -34,7 +34,6 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512ifma,avx512vl"), \ __min_vector_width__(256))) - #endif #if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__)) diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h index 964535c4c4900..84fda5c5849e8 100644 --- a/clang/lib/Headers/avx512vbmiintrin.h +++ b/clang/lib/Headers/avx512vbmiintrin.h @@ -19,59 +19,57 @@ __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \ __min_vector_width__(512))) -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutexvar_epi8 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutexvar_epi8 (__m512i __W, 
__mmask64 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)__W); @@ -99,8 +97,6 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), (__v64qi)_mm512_setzero_si512()); } - - +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #undef __DEFAULT_FN_ATTRS - #endif diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h index 4c50be7d9e7e5..58a48dadff863 100644 --- a/clang/lib/Headers/avx512vbmivlintrin.h +++ b/clang/lib/Headers/avx512vbmivlintrin.h @@ -24,117 +24,110 @@ __target__("avx512vbmi,avx512vl"), \ __min_vector_width__(256))) -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, (__v16qi)__I, (__v16qi)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I, (__v32qi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__I); } -static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutexvar_epi8 (__m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutexvar_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi8 (__m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)__W); @@ -186,7 +179,8 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) (__v32qi)_mm256_setzero_si256()); } - +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 263a1079b26d5..575c0c8962662 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -1223,69 +1223,61 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi) __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR 
_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)_mm256_setzero_si256()); diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index 5b2b3f0d0bbd4..885231b030b23 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -2010,24 +2010,24 @@ _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { (__v16hf)__A); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi)__B); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutexvar_ph(__m128i __A, __m128h __B) { return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h 
__DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutexvar_ph(__m256i __A, __m256h __B) { return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 92bb444aeb5b8..e5249926b934e 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -3556,13 +3556,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I, (__v4si)__B); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3570,7 +3570,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3578,7 +3578,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)__I); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3586,13 +3586,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)_mm_setzero_si128()); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I, (__v8si) __B); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3600,7 +3600,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)__A); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3608,7 +3608,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)__I); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3616,40 +3616,43 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)_mm256_setzero_si256()); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I, (__v2df)__B); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { + static __inline__ __m128d 
__DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)__A); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)(__m128d)__I); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)_mm_setzero_pd()); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I, (__v4df)__B); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3657,7 +3660,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)__A); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3665,7 +3668,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)(__m256d)__I); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3673,47 +3676,48 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)_mm256_setzero_pd()); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I, (__v4sf)__B); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)__A); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)(__m128)__I); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)_mm_setzero_ps()); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I, (__v8sf) __B); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR + _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, + __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), (__v8sf)__A); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, @@ -3721,7 +3725,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)(__m256)__I); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, @@ -3729,13 +3733,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I, (__v2di)__B); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3743,7 +3747,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3751,7 +3755,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)__I); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3759,14 +3763,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)_mm_setzero_si128()); } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I, (__v4di) __B); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -3774,7 +3777,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4di)__A); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -3782,7 +3785,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 
__B) { (__v4di)__I); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index e452d5f0920e9..30df01caed6cf 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -17,11 +17,11 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index a918af39e4074..208776eb7840e 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -1052,6 +1052,27 @@ float3 exp2(float3); _HLSL_BUILTIN_ALIAS(__builtin_elementwise_exp2) float4 exp2(float4); +//===----------------------------------------------------------------------===// +// f16tof32 builtins +//===----------------------------------------------------------------------===// + +/// \fn float f16tof32(uint x) +/// \brief Returns the half value stored in the low 16 bits of the uint arg +/// converted to a float. +/// \param x The uint containing two half values. +/// +/// The float value of the half value found in the low 16 bits of the \a xi +/// parameter. + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float f16tof32(uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float2 f16tof32(uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float3 f16tof32(uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float4 f16tof32(uint4); + //===----------------------------------------------------------------------===// // firstbithigh builtins //===----------------------------------------------------------------------===// @@ -2090,9 +2111,17 @@ T select(bool, T, T); /// \param FalseVals The vector values are chosen from when conditions are /// false. -template +template _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector select(vector, vector, vector); +vector select(vector, vector, vector); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, vector, vector); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, vector, vector); /// \fn vector select(vector Conds, T TrueVal, /// vector FalseVals) @@ -2102,9 +2131,17 @@ vector select(vector, vector, vector); /// \param FalseVals The vector values are chosen from when conditions are /// false. 
-template +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, T, vector); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, T, vector); + +template _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector select(vector, T, vector); +vector select(vector, T, vector); /// \fn vector select(vector Conds, vector TrueVals, /// T FalseVal) @@ -2113,9 +2150,17 @@ vector select(vector, T, vector); /// \param TrueVals The vector values are chosen from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. -template +template _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector select(vector, vector, T); +vector select(vector, vector, T); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, vector, T); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, vector, T); /// \fn vector select(vector Conds, vector TrueVals, /// T FalseVal) @@ -2124,10 +2169,20 @@ vector select(vector, vector, T); /// \param TrueVal The scalar value to splat from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. -template +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic::Value, vector> select( + vector, T, T); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic::Value, vector> select( + vector, T, T); + +template _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -__detail::enable_if_t<__detail::is_arithmetic::Value, vector> select( - vector, T, T); +__detail::enable_if_t<__detail::is_arithmetic::Value, vector> select( + vector, T, T); //===----------------------------------------------------------------------===// // sin builtins diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index 2e4d533356569..c13dd3fd48ac8 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -253,6 +253,11 @@ module _Builtin_stdbool [system] { export * } +module _Builtin_stdckdint [system] { + header "stdckdint.h" + export * +} + module _Builtin_stdcountof [system] { header "stdcountof.h" export * diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 637a08fe4dcdb..b8202ea11be36 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -258,6 +258,7 @@ static bool isBuiltinHeaderName(StringRef FileName) { .Case("stdarg.h", true) .Case("stdatomic.h", true) .Case("stdbool.h", true) + .Case("stdckdint.h", true) .Case("stdcountof.h", true) .Case("stddef.h", true) .Case("stdint.h", true) diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 6a5e5d4bad3a6..891c8ab7f3155 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -4018,7 +4018,7 @@ void Preprocessor::HandleEmbedDirective(SourceLocation HashLoc, Token &EmbedTok, this->LookupEmbedFile(Filename, isAngled, true, LookupFromFile); if (!MaybeFileRef) { // could not find file - if (Callbacks && Callbacks->EmbedFileNotFound(OriginalFilename)) { + if (Callbacks && Callbacks->EmbedFileNotFound(Filename)) { return; } Diag(FilenameTok, diag::err_pp_file_not_found) << Filename; diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 7e4a164e34eda..5fcb659768655 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -5370,8 +5370,13 @@ void Parser::ParseEnumBody(SourceLocation StartLoc, Decl 
*EnumDecl, T.consumeOpen(); // C does not allow an empty enumerator-list, C++ does [dcl.enum]. - if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) - Diag(Tok, diag::err_empty_enum); + if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) { + if (getLangOpts().MicrosoftExt) + Diag(T.getOpenLocation(), diag::ext_ms_c_empty_enum_type) + << SourceRange(T.getOpenLocation(), Tok.getLocation()); + else + Diag(Tok, diag::err_empty_enum); + } SmallVector EnumConstantDecls; SmallVector EnumAvailabilityDiags; diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 92038985f9163..fb45db1139349 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -813,7 +813,7 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx, return StmtError(); } } else { - LHS = Expr; + LHS = Actions.ActOnCaseExpr(CaseLoc, Expr); MissingCase = false; } diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 140b709dbb651..0997966f29f5e 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -29,7 +29,9 @@ #include "clang/Analysis/Analyses/CFGReachabilityAnalysis.h" #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -52,7 +54,9 @@ #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -3065,7 +3069,11 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( if (EnableLifetimeSafetyAnalysis && S.getLangOpts().CPlusPlus) { if (AC.getCFG()) { lifetimes::LifetimeSafetyReporterImpl LifetimeSafetyReporter(S); - lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + std::unique_ptr + Analysis = + lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + if (S.CollectStats) + FindMissingOrigins(AC, Analysis->getFactManager()); } } // Check for violations of "called once" parameter properties. 
@@ -3131,9 +3139,28 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } } +void clang::sema::AnalysisBasedWarnings::FindMissingOrigins( + AnalysisDeclContext &AC, lifetimes::internal::FactManager &FactMgr) { + if (AC.getCFG()) { + for (const auto &[expr, count] : + FactMgr.getOriginMgr().getMissingOrigins()) { + MissingOriginCount[expr] += count; + } + } +} + void clang::sema::AnalysisBasedWarnings::PrintStats() const { + // clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + unsigned totalMissingOrigins = 0; + for (const auto &[expr, count] : MissingOriginCount) { + llvm::errs() << expr << " : " << count << '\n'; + totalMissingOrigins += count; + } + llvm::errs() << "Total missing origins: " << totalMissingOrigins << "\n"; + llvm::errs() << "****************************************\n"; llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; - unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; unsigned AvgCFGBlocksPerFunction = !NumCFGsBuilt ? 0 : NumCFGBlocks/NumCFGsBuilt; diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 8590ee831084f..4b63eb7df1054 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -1208,8 +1208,16 @@ class Analyzer { return true; } - // No Decl, just an Expr. Just check based on its type. - checkIndirectCall(Call, CalleeExpr->getType()); + // No Decl, just an Expr. Just check based on its type. Bound member + // functions are a special expression type and need to be specially + // unpacked. + QualType CalleeExprQT = CalleeExpr->getType(); + if (CalleeExpr->isBoundMemberFunction(Outer.S.getASTContext())) { + QualType QT = Expr::findBoundMemberType(CalleeExpr); + if (!QT.isNull()) + CalleeExprQT = QT; + } + checkIndirectCall(Call, CalleeExprQT); return true; } @@ -1271,7 +1279,15 @@ class Analyzer { const CXXConstructorDecl *Ctor = Construct->getConstructor(); CallableInfo CI(*Ctor); followCall(CI, Construct->getLocation()); + return true; + } + bool VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *BTE) override { + const CXXDestructorDecl *Dtor = BTE->getTemporary()->getDestructor(); + if (Dtor != nullptr) { + CallableInfo CI(*Dtor); + followCall(CI, BTE->getBeginLoc()); + } return true; } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 94a490a8f68dc..b9707f0036765 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -2802,6 +2802,23 @@ static bool CheckUnsignedIntRepresentation(Sema *S, SourceLocation Loc, return false; } +static bool CheckExpectedBitWidth(Sema *S, CallExpr *TheCall, + unsigned ArgOrdinal, unsigned Width) { + QualType ArgTy = TheCall->getArg(0)->getType(); + if (auto *VTy = ArgTy->getAs()) + ArgTy = VTy->getElementType(); + // ensure arg type has expected bit width + uint64_t ElementBitCount = + S->getASTContext().getTypeSizeInChars(ArgTy).getQuantity() * 8; + if (ElementBitCount != Width) { + S->Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_integer_incorrect_bit_count) + << Width << ElementBitCount; + return true; + } + return false; +} + static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall, QualType ReturnType) { auto *VecTyA = TheCall->getArg(0)->getType()->getAs(); @@ -2961,24 +2978,16 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { CheckUnsignedIntVecRepresentation)) return true; - auto 
*VTy = TheCall->getArg(0)->getType()->getAs(); // ensure arg integers are 32-bits - uint64_t ElementBitCount = getASTContext() - .getTypeSizeInChars(VTy->getElementType()) - .getQuantity() * - 8; - if (ElementBitCount != 32) { - SemaRef.Diag(TheCall->getBeginLoc(), - diag::err_integer_incorrect_bit_count) - << 32 << ElementBitCount; + if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32)) return true; - } // ensure both args are vectors of total bit size of a multiple of 64 + auto *VTy = TheCall->getArg(0)->getType()->getAs(); int NumElementsArg = VTy->getNumElements(); if (NumElementsArg != 2 && NumElementsArg != 4) { SemaRef.Diag(TheCall->getBeginLoc(), diag::err_vector_incorrect_bit_count) - << 1 /*a multiple of*/ << 64 << NumElementsArg * ElementBitCount; + << 1 /*a multiple of*/ << 64 << NumElementsArg * 32; return true; } @@ -3295,7 +3304,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { break; } // Note these are llvm builtins that we want to catch invalid intrinsic - // generation. Normal handling of these builitns will occur elsewhere. + // generation. Normal handling of these builtins will occur elsewhere. case Builtin::BI__builtin_elementwise_bitreverse: { // does not include a check for number of arguments // because that is done previously @@ -3405,6 +3414,30 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { } break; } + case Builtin::BI__builtin_hlsl_elementwise_f16tof32: { + if (SemaRef.checkArgCount(TheCall, 1)) + return true; + if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, + CheckUnsignedIntRepresentation)) + return true; + // ensure arg integers are 32 bits + if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32)) + return true; + // check it wasn't a bool type + QualType ArgTy = TheCall->getArg(0)->getType(); + if (auto *VTy = ArgTy->getAs()) + ArgTy = VTy->getElementType(); + if (ArgTy->isBooleanType()) { + SemaRef.Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_builtin_invalid_arg_type) + << 1 << /* scalar or vector of */ 5 << /* unsigned int */ 3 + << /* no fp */ 0 << TheCall->getArg(0)->getType(); + return true; + } + + SetElementTypeAsReturnType(&SemaRef, TheCall, getASTContext().FloatTy); + break; + } } return false; } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 073010d16b428..cc6ddf568d346 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -1897,26 +1897,29 @@ void InitListChecker::CheckMatrixType(const InitializedEntity &Entity, return; const ConstantMatrixType *MT = DeclType->castAs(); + + // For HLSL, the error reporting for this case is handled in SemaHLSL's + // initializer list diagnostics. That means the execution should require + // getNumElementsFlattened to equal getNumInits. In other words the execution + // should never reach this point if this condition is not true". + assert(IList->getNumInits() == MT->getNumElementsFlattened() && + "Inits must equal Matrix element count"); + QualType ElemTy = MT->getElementType(); - const unsigned MaxElts = MT->getNumElementsFlattened(); - unsigned NumEltsInit = 0; + Index = 0; InitializedEntity ElemEnt = InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity); - while (NumEltsInit < MaxElts && Index < IList->getNumInits()) { + while (Index < IList->getNumInits()) { // Not a sublist: just consume directly. 
- ElemEnt.setElementIndex(Index); - CheckSubElementType(ElemEnt, IList, ElemTy, Index, StructuredList, + unsigned ColMajorIndex = (Index % MT->getNumRows()) * MT->getNumColumns() + + (Index / MT->getNumRows()); + ElemEnt.setElementIndex(ColMajorIndex); + CheckSubElementType(ElemEnt, IList, ElemTy, ColMajorIndex, StructuredList, StructuredIndex); - ++NumEltsInit; + ++Index; } - - // For HLSL The error for this case is handled in SemaHLSL's initializer - // list diagnostics, That means the execution should require NumEltsInit - // to equal Max initializers. In other words execution should never - // reach this point if this condition is not true". - assert(NumEltsInit == MaxElts && "NumEltsInit must equal MaxElts"); } void InitListChecker::CheckVectorType(const InitializedEntity &Entity, diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index f39896336053e..5b3ef1adf38e3 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3281,6 +3281,9 @@ static Scope *FindLabeledBreakContinueScope(Sema &S, Scope *CurScope, SourceLocation LabelLoc, bool IsContinue) { assert(Target && "not a named break/continue?"); + + Target->markUsed(S.Context); + Scope *Found = nullptr; for (Scope *Scope = CurScope; Scope; Scope = Scope->getParent()) { if (Scope->isFunctionScope()) diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index ad50600f6399c..bfcd3978817ca 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -659,7 +659,8 @@ struct ConvertConstructorToDeductionGuideTransform { SemaRef, MaterializedTypedefs, NestedPattern, TransformingOuterPatterns ? &Args : nullptr) .transform(NewDI); - + if (!NewDI) + return nullptr; // Resolving a wording defect, we also inherit default arguments from the // constructor. ExprResult NewDefArg; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 280b3c92cce14..c483930705057 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2358,6 +2358,11 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr, return QualType(); } + if (VecSize->isNegative()) { + Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size); + return QualType(); + } + if (CurType->isDependentType()) return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc, VectorKind::Generic); @@ -2394,7 +2399,7 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr, VectorKind::Generic); } -QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, +QualType Sema::BuildExtVectorType(QualType T, Expr *SizeExpr, SourceLocation AttrLoc) { // Unlike gcc's vector_size attribute, we do not allow vectors to be defined // in conjunction with complex types (pointers, arrays, functions, etc.). 
@@ -2417,35 +2422,40 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, BIT && CheckBitIntElementType(*this, AttrLoc, BIT)) return QualType(); - if (!ArraySize->isTypeDependent() && !ArraySize->isValueDependent()) { - std::optional vecSize = - ArraySize->getIntegerConstantExpr(Context); - if (!vecSize) { + if (!SizeExpr->isTypeDependent() && !SizeExpr->isValueDependent()) { + std::optional VecSize = + SizeExpr->getIntegerConstantExpr(Context); + if (!VecSize) { Diag(AttrLoc, diag::err_attribute_argument_type) - << "ext_vector_type" << AANT_ArgumentIntegerConstant - << ArraySize->getSourceRange(); + << "ext_vector_type" << AANT_ArgumentIntegerConstant + << SizeExpr->getSourceRange(); + return QualType(); + } + + if (VecSize->isNegative()) { + Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size); return QualType(); } - if (!vecSize->isIntN(32)) { + if (!VecSize->isIntN(32)) { Diag(AttrLoc, diag::err_attribute_size_too_large) - << ArraySize->getSourceRange() << "vector"; + << SizeExpr->getSourceRange() << "vector"; return QualType(); } // Unlike gcc's vector_size attribute, the size is specified as the // number of elements, not the number of bytes. - unsigned vectorSize = static_cast(vecSize->getZExtValue()); + unsigned VectorSize = static_cast(VecSize->getZExtValue()); - if (vectorSize == 0) { + if (VectorSize == 0) { Diag(AttrLoc, diag::err_attribute_zero_size) - << ArraySize->getSourceRange() << "vector"; + << SizeExpr->getSourceRange() << "vector"; return QualType(); } - return Context.getExtVectorType(T, vectorSize); + return Context.getExtVectorType(T, VectorSize); } - return Context.getDependentSizedExtVectorType(T, ArraySize, AttrLoc); + return Context.getDependentSizedExtVectorType(T, SizeExpr, AttrLoc); } QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 3ac338e013deb..b1fd151790d96 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -4374,8 +4374,7 @@ class ASTDeclContextNameLookupTrait // parent of parent. We DON'T remove the enum constant from its parent. So // we don't need to care about merging problems here. if (auto *ECD = dyn_cast(D); - ECD && DC.isFileContext() && ECD->getOwningModule() && - ECD->getTopLevelOwningNamedModule()->isNamedModule()) { + ECD && DC.isFileContext() && ECD->getTopLevelOwningNamedModule()) { if (llvm::all_of( DC.noload_lookup( cast(ECD->getDeclContext())->getDeclName()), diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index 2838533c1a406..4f4824a3616ce 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -714,11 +714,6 @@ class RegionStoreManager : public StoreManager { return getBinding(getRegionBindings(S), L, T); } - std::optional getUniqueDefaultBinding(RegionBindingsConstRef B, - const TypedValueRegion *R) const; - std::optional - getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const; - std::optional getDefaultBinding(Store S, const MemRegion *R) override { RegionBindingsRef B = getRegionBindings(S); // Default bindings are always applied over a base region so look up the @@ -2465,11 +2460,6 @@ SVal RegionStoreManager::getBindingForStruct(RegionBindingsConstRef B, // behavior doesn't depend on the struct layout. // This way even an empty struct can carry taint, no matter if creduce drops // the last field member or not. 
- - // Try to avoid creating a LCV if it would anyways just refer to a single - // default binding. - if (std::optional Val = getUniqueDefaultBinding(B, R)) - return *Val; return createLazyBinding(B, R); } @@ -2757,50 +2747,12 @@ RegionStoreManager::bindVector(LimitedRegionBindingsConstRef B, return NewB; } -std::optional -RegionStoreManager::getUniqueDefaultBinding(RegionBindingsConstRef B, - const TypedValueRegion *R) const { - if (R != R->getBaseRegion()) - return std::nullopt; - - const auto *Cluster = B.lookup(R); - if (!Cluster || !llvm::hasSingleElement(*Cluster)) - return std::nullopt; - - const auto [Key, Value] = *Cluster->begin(); - return Key.isDirect() ? std::optional{} : Value; -} - -std::optional -RegionStoreManager::getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const { - auto B = getRegionBindings(LCV.getStore()); - return getUniqueDefaultBinding(B, LCV.getRegion()); -} - std::optional RegionStoreManager::tryBindSmallStruct( LimitedRegionBindingsConstRef B, const TypedValueRegion *R, const RecordDecl *RD, nonloc::LazyCompoundVal LCV) { if (B.hasExhaustedBindingLimit()) return B.withValuesEscaped(LCV); - // If we try to copy a Conjured value representing the value of the whole - // struct, don't try to element-wise copy each field. - // That would unnecessarily bind Derived symbols slicing off the subregion for - // the field from the whole Conjured symbol. - // - // struct Window { int width; int height; }; - // Window getWindow(); <-- opaque fn. - // Window w = getWindow(); <-- conjures a new Window. - // Window w2 = w; <-- trivial copy "w", calling "tryBindSmallStruct" - // - // We should not end up with a new Store for "w2" like this: - // Direct [ 0..31]: Derived{Conj{}, w.width} - // Direct [32..63]: Derived{Conj{}, w.height} - // Instead, we should just bind that Conjured value instead. 
- if (std::optional Val = getUniqueDefaultBinding(LCV)) { - return B.addBinding(BindingKey::Make(R, BindingKey::Default), Val.value()); - } - FieldVector Fields; if (const CXXRecordDecl *Class = dyn_cast(RD)) diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index 427d3a106656b..e283a7b42e554 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -374,7 +374,7 @@ namespace GH150709 { namespace DiscardedAddrLabel { void foo(void) { L: - *&&L; // both-error {{indirection not permitted}} \ + *&&L; // both-error {{indirection not permitted on operand of type 'void *'}} \ // both-warning {{expression result unused}} } } diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 83f32c97c50c7..4799ebe25dde1 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1882,3 +1882,14 @@ namespace MethodWillHaveBody { } int n = f(0); // both-note {{instantiation of}} } + +namespace StaticRedecl { + struct T { + static T tt; + constexpr T() : p(&tt) {} + T *p; + }; + T T::tt; + constexpr T t; + static_assert(t.p == &T::tt, ""); +} diff --git a/clang/test/AST/HLSL/matrix-constructors.hlsl b/clang/test/AST/HLSL/matrix-constructors.hlsl index 0a2f03c7c0fac..e1a9c53e2c602 100644 --- a/clang/test/AST/HLSL/matrix-constructors.hlsl +++ b/clang/test/AST/HLSL/matrix-constructors.hlsl @@ -9,21 +9,20 @@ typedef float float4 __attribute__((ext_vector_type(4))); [numthreads(1,1,1)] void ok() { - // CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} A 'float2x3':'matrix' cinit // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' functional cast to float2x3 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 A = float2x3(1,2,3,4,5,6); @@ -57,6 +56,8 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue @@ -68,12 +69,10 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 
'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 D = float2x3(float2(1,2), 3, 4, 5, 6); @@ -97,9 +96,9 @@ void ok() { // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent @@ -107,10 +106,12 @@ void ok() { // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue @@ -120,9 +121,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 E = float2x3(float2(1,2), float2(3,4), 5, 6); @@ -158,7 +157,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 
'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' xvalue @@ -172,7 +171,9 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' xvalue @@ -186,9 +187,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 F = float2x3(float4(1,2,3,4), 5, 6); @@ -202,10 +201,10 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 @@ -215,41 +214,41 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' matrixcomponent // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // 
CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' matrixcomponent // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); @@ -262,13 +261,13 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 float2 Vec2 = float2(1.0, 2.0); float2x2 H = float2x2(Vec2,3,4); @@ -281,10 +280,10 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 
'int' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'l' 'int' @@ -300,15 +299,15 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .a 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .a 0x{{[0-9a-fA-F]+}} -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .a 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' struct S { float2 f; float a;} s; float2x2 J = float2x2(s.f, s.a, s.a); @@ -317,8 +316,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 1.000000e+00 -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 4.000000e+00 typedef float2x2 second_level_of_typedefs; second_level_of_typedefs L = float2x2(1.0f, 2.0f, 3.0f, 4.0f); @@ -327,8 +326,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'second_level_of_typedefs':'matrix' functional cast to second_level_of_typedefs // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'second_level_of_typedefs':'matrix' // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 1.000000e+00 -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 4.000000e+00 float2x2 M = second_level_of_typedefs(1.0f, 2.0f, 3.0f, 4.0f); @@ -367,12 +366,12 @@ float2x1 GettingStrange = float2x1(s2, s2); // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: 
DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} diff --git a/clang/test/AST/HLSL/matrix-general-initializer.hlsl b/clang/test/AST/HLSL/matrix-general-initializer.hlsl index 14c950acb7baf..1a631113eb0f0 100644 --- a/clang/test/AST/HLSL/matrix-general-initializer.hlsl +++ b/clang/test/AST/HLSL/matrix-general-initializer.hlsl @@ -26,14 +26,6 @@ void ok() { // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' @@ -42,20 +34,8 @@ void ok() { // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'int' x -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' 
xvalue @@ -66,10 +46,30 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'int' x +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; @@ -84,12 +84,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent @@ -105,12 +105,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 
'float4x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent @@ -138,7 +138,7 @@ S s = {m}; // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent @@ -146,7 +146,7 @@ S s = {m}; // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent @@ -169,25 +169,25 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 
0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' @@ -199,26 +199,26 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent @@ -229,25 +229,25 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 
0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' diff --git a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt index ce37a29655668..2f9c2ac247497 100644 --- a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt +++ b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt @@ -15,6 +15,13 @@ set(LIFETIME_BENCHMARK_REQUIREMENTS set(LIFETIME_BENCHMARK_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/benchmark_results") +if(WIN32) + set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE + "${LIFETIME_BENCHMARK_VENV_DIR}/Scripts/python") +else() + set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE + "${LIFETIME_BENCHMARK_VENV_DIR}/bin/python") +endif() if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREMENTS}) @@ -22,7 +29,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME add_custom_command( OUTPUT ${LIFETIME_BENCHMARK_VENV_DIR}/pyvenv.cfg COMMAND ${Python3_EXECUTABLE} -m venv ${LIFETIME_BENCHMARK_VENV_DIR} - COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS} + COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS} DEPENDS ${LIFETIME_BENCHMARK_REQUIREMENTS} COMMENT "Creating Python virtual environment and installing dependencies for benchmark..." 
) @@ -32,7 +39,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME # Main benchmark target add_custom_target(benchmark_lifetime_safety_analysis - COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python ${LIFETIME_BENCHMARK_SCRIPT} + COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} ${LIFETIME_BENCHMARK_SCRIPT} --clang-binary ${LLVM_BINARY_DIR}/bin/clang --output-dir ${LIFETIME_BENCHMARK_OUTPUT_DIR} diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp index c417b9c2ac97e..fd831cc0985cc 100644 --- a/clang/test/Analysis/NewDelete-checker-test.cpp +++ b/clang/test/Analysis/NewDelete-checker-test.cpp @@ -3,13 +3,13 @@ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete // -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks %s \ +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \ // RUN: -verify=expected,newdelete,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // -// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \ +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \ // RUN: -verify=expected,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks @@ -19,13 +19,13 @@ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete // -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++17 -fblocks %s \ +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \ // RUN: -verify=expected,newdelete,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // -// RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \ +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \ // RUN: -verify=expected,leak,inspection \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks \ @@ -503,3 +503,75 @@ namespace optional_union { custom_union_t a; } // leak-warning{{Potential leak of memory pointed to by 'a.present.q'}} } + +namespace gh153782 { + +// Ensure we do not regress on the following use case. + +namespace mutually_exclusive_test_case_1 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}} + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +void test_non_trivial_struct_assignment() { + StorageWrapper* object = new StorageWrapper[]{StorageWrapper()}; + object[0] = StorageWrapper(); // This assignment leads to the double-free. +} +} // mutually_exclusive_test_case_1 + +namespace mutually_exclusive_test_case_2 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } + + // Not provided, typically would do `parts = new long`. 
+ StorageWrapper(); + + long* parts; +}; + +void test_non_trivial_struct_assignment() { + StorageWrapper* object = new StorageWrapper[]{StorageWrapper()}; + // object[0] = StorageWrapper(); // Remove the source of double free to make the potential leak appear. +} // leak-warning{{Potential leak of memory pointed to by 'object'}} +} // mutually_exclusive_test_case_2 + +namespace mutually_exclusive_test_case_3 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}} + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +struct TestDoubleFreeWithInitializerList { + StorageWrapper* Object; + TestDoubleFreeWithInitializerList() + : Object(new StorageWrapper[]{StorageWrapper()}) { + Object[0] = StorageWrapper(); // This assignment leads to the double-free. + } +}; +} // mutually_exclusive_test_case_3 + +} // namespace gh153782 diff --git a/clang/test/Analysis/ctor-trivial-copy.cpp b/clang/test/Analysis/ctor-trivial-copy.cpp index 940ff9ba3ed9c..44990fc631d6d 100644 --- a/clang/test/Analysis/ctor-trivial-copy.cpp +++ b/clang/test/Analysis/ctor-trivial-copy.cpp @@ -5,8 +5,6 @@ void clang_analyzer_printState(); template void clang_analyzer_dump_lref(T& param); template void clang_analyzer_dump_val(T param); -template void clang_analyzer_denote(T param, const char *name); -template void clang_analyzer_express(T param); template T conjure(); template void nop(const Ts &... args) {} @@ -42,10 +40,16 @@ void test_assign_return() { namespace trivial_struct_copy { void _01_empty_structs() { - clang_analyzer_dump_val(conjure()); // expected-warning {{conj_$}} + clang_analyzer_dump_val(conjure()); // expected-warning {{lazyCompoundVal}} empty Empty = conjure(); empty Empty2 = Empty; empty Empty3 = Empty2; + // All of these should refer to the exact same LCV, because all of + // these trivial copies refer to the original conjured value. + // There were Unknown before: + clang_analyzer_dump_val(Empty); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Empty2); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Empty3); // expected-warning {{lazyCompoundVal}} // We only have binding for the original Empty object, because copying empty // objects is a no-op in the performTrivialCopy. This is fine, because empty @@ -67,20 +71,18 @@ void _01_empty_structs() { } void _02_structs_with_members() { - clang_analyzer_dump_val(conjure()); // expected-warning {{conj_$}} + clang_analyzer_dump_val(conjure()); // expected-warning {{lazyCompoundVal}} aggr Aggr = conjure(); aggr Aggr2 = Aggr; aggr Aggr3 = Aggr2; - // All of these should refer to the exact same symbol, because all of + // All of these should refer to the exact same LCV, because all of // these trivial copies refer to the original conjured value. - clang_analyzer_denote(Aggr, "$Aggr"); - clang_analyzer_express(Aggr); // expected-warning {{$Aggr}} - clang_analyzer_express(Aggr2); // expected-warning {{$Aggr}} - clang_analyzer_express(Aggr3); // expected-warning {{$Aggr}} - - // We should have the same Conjured symbol for "Aggr", "Aggr2" and "Aggr3". 
- // We used to have Derived symbols for the individual fields that were - // copied as part of copying the whole struct. + clang_analyzer_dump_val(Aggr); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Aggr2); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Aggr3); // expected-warning {{lazyCompoundVal}} + + // We have fields in the struct we copy, thus we also have the entries for the copies + // (and for all of their fields). clang_analyzer_printState(); // CHECK: "store": { "pointer": "0x{{[0-9a-f]+}}", "items": [ // CHECK-NEXT: { "cluster": "GlobalInternalSpaceRegion", "pointer": "0x{{[0-9a-f]+}}", "items": [ @@ -93,10 +95,12 @@ void _02_structs_with_members() { // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ:conj_\$[0-9]+{int, LC[0-9]+, S[0-9]+, #[0-9]+}]]" } // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "Aggr2", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" } + // CHECK-NEXT: { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" }, + // CHECK-NEXT: { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" } // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "Aggr3", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" } + // CHECK-NEXT: { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" }, + // CHECK-NEXT: { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" } // CHECK-NEXT: ]} // CHECK-NEXT: ]}, @@ -113,3 +117,31 @@ void entrypoint() { } } // namespace trivial_struct_copy + +namespace gh153782 { + +// Ensure we do not regress on the following use cases. +// The assumption made on a field in `setPtr` should apply to the returned copy in `func`. +struct Status { int error; }; +Status getError(); + +Status setPtr(int **outptr, int* ptr) { + Status e = getError(); + if (e.error != 0) return e; // When assuming the error field is non-zero, + *outptr = ptr; // this is not executed + return e; +} + +int func() { + int *ptr = nullptr; + int x = 42; + if (setPtr(&ptr, &x).error == 0) { + // The assumption made in get() SHOULD match the assumption about + // the returned value, hence the engine SHOULD NOT assume ptr is null. 
+ clang_analyzer_dump_val(ptr); // expected-warning {{&x}} + return *ptr; + } + return 0; +} + +} // namespace gh153782 diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index dfc650223c9e7..9474aa7c7dbb1 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -99,7 +99,7 @@ class C { } // end of anonymous namespace void test_6() { - clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \+0\)'$}}}} + clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^lazily frozen compound value of 1st parameter of function 'clang_analyzer_explain\(\)'$}}}} clang_analyzer_explain(conjure_S().z); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \)'\) for field 'z' of temporary object constructed at statement 'conjure_S\(\)'$}}}} } diff --git a/clang/test/Analysis/iterator-modeling.cpp b/clang/test/Analysis/iterator-modeling.cpp index 78882da4431fd..f1538839d06c8 100644 --- a/clang/test/Analysis/iterator-modeling.cpp +++ b/clang/test/Analysis/iterator-modeling.cpp @@ -2035,7 +2035,6 @@ void print_state(std::vector &V) { // CHECK: "checker_messages": [ // CHECK: { "checker": "alpha.cplusplus.IteratorModeling", "messages": [ // CHECK-NEXT: "Iterator Positions :", - // CHECK-NEXT: "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}", // CHECK-NEXT: "i0 : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}" // CHECK-NEXT: ]} @@ -2046,7 +2045,6 @@ void print_state(std::vector &V) { // CHECK: "checker_messages": [ // CHECK: { "checker": "alpha.cplusplus.IteratorModeling", "messages": [ // CHECK-NEXT: "Iterator Positions :", - // CHECK-NEXT: "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}", // CHECK-NEXT: "i1 : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}" // CHECK-NEXT: ]} diff --git a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp index 191af95cd2b9c..98301cf7274fc 100644 --- a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp +++ b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp @@ -4,16 +4,6 @@ // RUN: -analyzer-config alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling=true\ // RUN: -verify -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because -// these tests started failing after we just directly copy the symbol -// representing the value of a variable instead of creating a LazyCompoundVal -// of that single conjured value. -// In theory, it shouldn't matter if we eagerly copy the value that we would -// "load" from the LCV once requested or just directly binding the backing symbol. -// Yet, these tests fail, so there is likely messed up how/what the checker -// metadata is associated with. 
-// XFAIL: * - #include "Inputs/system-header-simulator-cxx.h" void clang_analyzer_eval(bool); diff --git a/clang/test/Analysis/stl-algorithm-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling.cpp index f7029c79b0942..5549c24a8c220 100644 --- a/clang/test/Analysis/stl-algorithm-modeling.cpp +++ b/clang/test/Analysis/stl-algorithm-modeling.cpp @@ -3,16 +3,6 @@ // RUN: -analyzer-config aggressive-binary-operation-simplification=true\ // RUN: -verify -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because -// these tests started failing after we just directly copy the symbol -// representing the value of a variable instead of creating a LazyCompoundVal -// of that single conjured value. -// In theory, it shouldn't matter if we eagerly copy the value that we would -// "load" from the LCV once requested or just directly binding the backing symbol. -// Yet, these tests fail, so there is likely messed up how/what the checker -// metadata is associated with. -// XFAIL: * - #include "Inputs/system-header-simulator-cxx.h" void clang_analyzer_eval(bool); diff --git a/clang/test/Analysis/store-dump-orders.cpp b/clang/test/Analysis/store-dump-orders.cpp index dbe93f1c5183a..d99f581f00fe1 100644 --- a/clang/test/Analysis/store-dump-orders.cpp +++ b/clang/test/Analysis/store-dump-orders.cpp @@ -41,7 +41,7 @@ void test_output(int n) { // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "conj_$ // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "objfirst", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "conj_$ + // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "lazyCompoundVal // CHECK-NEXT: { "kind": "Direct", "offset": 320, "value": "1 S32b" }, // CHECK-NEXT: { "kind": "Direct", "offset": 352, "value": "2 S32b" }, // CHECK-NEXT: { "kind": "Direct", "offset": 384, "value": "3 S32b" } diff --git a/clang/test/Analysis/taint-generic.cpp b/clang/test/Analysis/taint-generic.cpp index fc7c37300d3fc..4b8d9ab68ff84 100644 --- a/clang/test/Analysis/taint-generic.cpp +++ b/clang/test/Analysis/taint-generic.cpp @@ -158,7 +158,11 @@ void top() { clang_analyzer_isTainted(E); // expected-warning {{NO}} Aggr A = mySource1(); - clang_analyzer_isTainted(A); // expected-warning {{YES}} + // FIXME Ideally, both A and A.data should be tainted. However, the + // implementation used by e5ac9145ba29 ([analyzer][taint] Recognize + // tainted LazyCompoundVals (4/4) (#115919), 2024-11-15) led to FPs and + // FNs in various scenarios and had to be reverted to fix #153782. + clang_analyzer_isTainted(A); // expected-warning {{NO}} clang_analyzer_isTainted(A.data); // expected-warning {{YES}} } } // namespace gh114270 diff --git a/clang/test/Analysis/template-param-objects.cpp b/clang/test/Analysis/template-param-objects.cpp index b065f8756d4d8..dde95fa62cb65 100644 --- a/clang/test/Analysis/template-param-objects.cpp +++ b/clang/test/Analysis/template-param-objects.cpp @@ -11,7 +11,7 @@ bool operator ==(Box lhs, Box rhs) { return lhs.value == rhs.value; } template void dumps() { - clang_analyzer_dump(V); // expected-warning {{Unknown}} + clang_analyzer_dump(V); // expected-warning {{lazyCompoundVal}} clang_analyzer_dump(&V); // expected-warning {{Unknown}} clang_analyzer_dump(V.value); // expected-warning {{Unknown}} FIXME: It should be '6 S32b'. 
clang_analyzer_dump(&V.value); // expected-warning {{Unknown}} diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c index 65799881a0cbe..d5bea8446d730 100644 --- a/clang/test/CIR/CodeGen/atomic.c +++ b/clang/test/CIR/CodeGen/atomic.c @@ -46,6 +46,32 @@ void f2(void) { // OGCG-NEXT: store i32 42, ptr %[[SLOT]], align 4 // OGCG: } +void f3(_Atomic(int) *p) { + *p = 42; +} + +// CIR-LABEL: @f3 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !s32i, !cir.ptr + +// LLVM-LABEL: @f3 +// LLVM: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f3 +// OGCG: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +void f4(_Atomic(float) *p) { + *p = 3.14; +} + +// CIR-LABEL: @f4 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !cir.float, !cir.ptr + +// LLVM-LABEL: @f4 +// LLVM: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f4 +// OGCG: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + void load(int *ptr) { int x; __atomic_load(ptr, &x, __ATOMIC_RELAXED); diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c index 44c54b4a2969a..4520063c56ee6 100644 --- a/clang/test/CIR/CodeGen/binassign.c +++ b/clang/test/CIR/CodeGen/binassign.c @@ -100,3 +100,107 @@ void binary_assign_struct() { // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LS_PTR]], ptr align 4 @gs, i64 8, i1 false) // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LSV_PTR]], ptr align 4 @gsv, i64 8, i1 true) // OGCG: ret void + +int ignore_result_assign() { + int arr[10]; + int i, j; + j = i = 123, 0; + j = arr[i = 5]; + int *p, *q = 0; + if(p = q) + return 1; + return 0; +} + +// CIR-LABEL: cir.func{{.*}} @ignore_result_assign() -> !s32i +// CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[ARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["arr"] +// CIR: %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i"] +// CIR: %[[J:.*]] = cir.alloca !s32i, !cir.ptr, ["j"] +// CIR: %[[P:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p"] +// CIR: %[[Q:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["q", init] +// CIR: %[[VAL_123:.*]] = cir.const #cir.int<123> : !s32i +// CIR: cir.store{{.*}} %[[VAL_123]], %[[I]] : !s32i, !cir.ptr +// CIR: cir.store{{.*}} %[[VAL_123]], %[[J]] : !s32i, !cir.ptr +// CIR: %[[VAL_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[VAL_5:.*]] = cir.const #cir.int<5> : !s32i +// CIR: cir.store{{.*}} %[[VAL_5]], %[[I]] : !s32i, !cir.ptr +// CIR: %[[ARR_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr +// CIR: %[[ARR_ELEM:.*]] = cir.ptr_stride %[[ARR_DECAY]], %[[VAL_5]] : (!cir.ptr, !s32i) -> !cir.ptr +// CIR: %[[ARR_LOAD:.*]] = cir.load{{.*}} %[[ARR_ELEM]] : !cir.ptr, !s32i +// CIR: cir.store{{.*}} %[[ARR_LOAD]], %[[J]] : !s32i, !cir.ptr +// CIR: %[[NULL:.*]] = cir.const #cir.ptr : !cir.ptr +// CIR: cir.store{{.*}} %[[NULL]], %[[Q]] : !cir.ptr, !cir.ptr> +// CIR: cir.scope { +// CIR: %[[Q_VAL:.*]] = cir.load{{.*}} %[[Q]] : !cir.ptr>, !cir.ptr +// CIR: cir.store{{.*}} %[[Q_VAL]], %[[P]] : !cir.ptr, !cir.ptr> +// CIR: %[[COND:.*]] = cir.cast ptr_to_bool %[[Q_VAL]] : !cir.ptr -> !cir.bool +// CIR: cir.if %[[COND]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store %[[ONE]], %[[RETVAL]] : !s32i, !cir.ptr +// CIR: %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr, !s32i +// CIR: cir.return +// CIR: } +// CIR: } +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store %[[ZERO]], %[[RETVAL]] : !s32i, !cir.ptr 
+// CIR: %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr, !s32i +// CIR: cir.return + +// LLVM-LABEL: define {{.*}}i32 @ignore_result_assign() +// LLVM: %[[RETVAL_PTR:.*]] = alloca i32 +// LLVM: %[[ARR_PTR:.*]] = alloca [10 x i32] +// LLVM: %[[I_PTR:.*]] = alloca i32 +// LLVM: %[[J_PTR:.*]] = alloca i32 +// LLVM: %[[P_PTR:.*]] = alloca ptr +// LLVM: %[[Q_PTR:.*]] = alloca ptr +// LLVM: store i32 123, ptr %[[I_PTR]] +// LLVM: store i32 123, ptr %[[J_PTR]] +// LLVM: store i32 5, ptr %[[I_PTR]] +// LLVM: %[[GEP1:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0 +// LLVM: %[[GEP2:.*]] = getelementptr i32, ptr %[[GEP1]], i64 5 +// LLVM: %[[ARR_VAL:.*]] = load i32, ptr %[[GEP2]] +// LLVM: store i32 %[[ARR_VAL]], ptr %[[J_PTR]] +// LLVM: store ptr null, ptr %[[Q_PTR]] +// LLVM: br label +// LLVM: %[[Q_VAL:.*]] = load ptr, ptr %[[Q_PTR]] +// LLVM: store ptr %[[Q_VAL]], ptr %[[P_PTR]] +// LLVM: %[[CMP:.*]] = icmp ne ptr %[[Q_VAL]], null +// LLVM: br i1 %[[CMP]], label %[[THEN:.*]], label %[[ELSE:.*]] +// LLVM: [[THEN]]: +// LLVM: store i32 1, ptr %[[RETVAL_PTR]] +// LLVM: %{{.*}} = load i32, ptr %[[RETVAL_PTR]] +// LLVM: ret i32 +// LLVM: [[ELSE]]: +// LLVM: br label +// LLVM: store i32 0, ptr %[[RETVAL_PTR]] +// LLVM: %{{.*}} = load i32, ptr %[[RETVAL_PTR]] +// LLVM: ret i32 + +// OGCG-LABEL: define {{.*}}i32 @ignore_result_assign() +// OGCG: %[[RETVAL:.*]] = alloca i32 +// OGCG: %[[ARR:.*]] = alloca [10 x i32] +// OGCG: %[[I:.*]] = alloca i32 +// OGCG: %[[J:.*]] = alloca i32 +// OGCG: %[[P:.*]] = alloca ptr +// OGCG: %[[Q:.*]] = alloca ptr +// OGCG: store i32 123, ptr %[[I]] +// OGCG: store i32 123, ptr %[[J]] +// OGCG: store i32 5, ptr %[[I]] +// OGCG: %[[ARRAYIDX:.*]] = getelementptr inbounds [10 x i32], ptr %[[ARR]], i64 0, i64 5 +// OGCG: %[[ARR_VAL:.*]] = load i32, ptr %[[ARRAYIDX]] +// OGCG: store i32 %[[ARR_VAL]], ptr %[[J]] +// OGCG: store ptr null, ptr %[[Q]] +// OGCG: %[[Q_VAL:.*]] = load ptr, ptr %[[Q]] +// OGCG: store ptr %[[Q_VAL]], ptr %[[P]] +// OGCG: %[[TOBOOL:.*]] = icmp ne ptr %[[Q_VAL]], null +// OGCG: br i1 %[[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// OGCG: [[IF_THEN]]: +// OGCG: store i32 1, ptr %[[RETVAL]] +// OGCG: br label %[[RETURN:.*]] +// OGCG: [[IF_END]]: +// OGCG: store i32 0, ptr %[[RETVAL]] +// OGCG: br label %[[RETURN]] +// OGCG: [[RETURN]]: +// OGCG: %{{.*}} = load i32, ptr %[[RETVAL]] +// OGCG: ret i32 diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c b/clang/test/CIR/CodeGen/builtins-floating-point.c index 193cc172d37d2..1b7de650662c7 100644 --- a/clang/test/CIR/CodeGen/builtins-floating-point.c +++ b/clang/test/CIR/CodeGen/builtins-floating-point.c @@ -7,14 +7,42 @@ float cosf(float f) { return __builtin_cosf(f); - // CHECK: %{{.*}} = cir.cos {{.*}} : !cir.float + // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.float // LLVM: %{{.*}} = call float @llvm.cos.f32(float %{{.*}}) // OGCG: %{{.*}} = call float @llvm.cos.f32(float %{{.*}}) } double cos(double f) { return __builtin_cos(f); - // CIR: {{.+}} = cir.cos {{.+}} : !cir.double + // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.double // LLVM: %{{.*}} = call double @llvm.cos.f64(double %{{.*}}) // OGCG: %{{.*}} = call double @llvm.cos.f64(double %{{.*}}) } + +float ceil(float f) { + return __builtin_ceilf(f); + // CIR: %{{.*}} = cir.ceil %{{.*}} : !cir.float + // LLVM: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}}) + // OGCG: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}}) +} + +float expf(float f) { + return __builtin_expf(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.float + // LLVM: %{{.*}} = 
call float @llvm.exp.f32(float %{{.*}}) + // OGCG: %{{.*}} = call float @llvm.exp.f32(float %{{.*}}) +} + +double exp(double f) { + return __builtin_exp(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.double + // LLVM: %{{.*}} = call double @llvm.exp.f64(double %{{.*}}) + // OGCG: %{{.*}} = call double @llvm.exp.f64(double %{{.*}}) +} + +long double expl(long double f) { + return __builtin_expl(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.long_double + // LLVM: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}}) + // OGCG: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}}) +} diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index c8db71498e477..ee543001025e7 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -344,3 +344,47 @@ void struct_with_const_member_expr() { // OGCG: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0 // OGCG: store i8 %[[BF_SET]], ptr %[[REF_ADDR]], align 4 // OGCG: store i32 0, ptr %[[A_ADDR]], align 4 + +void function_arg_with_default_value(CompleteS a = {1, 2}) {} + +// CIR: %[[ARG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init] +// CIR: cir.store %{{.*}}, %[[ARG_ADDR]] : !rec_CompleteS, !cir.ptr + +// LLVM: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: store %struct.CompleteS %{{.*}}, ptr %[[ARG_ADDR]], align 4 + +// OGCG: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: store i64 %{{.*}}, ptr %[[ARG_ADDR]], align 4 + +void calling_function_with_default_values() { + function_arg_with_default_value(); +} + +// CIR: %[[AGG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["agg.tmp0"] +// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[AGG_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[CONST_1]], %[[ELEM_0_PTR]] : !s32i, !cir.ptr +// CIR: %[[ELEM_1_PTR:.*]] = cir.get_member %[[AGG_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[CONST_2_I8:.*]] = cir.cast integral %[[CONST_2]] : !s32i -> !s8i +// CIR: cir.store{{.*}} %[[CONST_2_I8]], %[[ELEM_1_PTR]] : !s8i, !cir.ptr +// CIR: %[[TMP_AGG:.*]] = cir.load{{.*}} %[[AGG_ADDR]] : !cir.ptr, !rec_CompleteS +// CIR: cir.call @_Z31function_arg_with_default_value9CompleteS(%[[TMP_AGG]]) : (!rec_CompleteS) -> () + +// TODO(CIR): the difference between the CIR LLVM and OGCG is because the lack of calling convention lowering, + +// LLVM: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0 +// LLVM: store i32 1, ptr %[[ELEM_0_PTR]], align 4 +// LLVM: %[[ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1 +// LLVM: store i8 2, ptr %[[ELEM_1_PTR]], align 4 +// LLVM: %[[TMP_AGG:.*]] = load %struct.CompleteS, ptr %[[AGG_ADDR]], align 4 +// LLVM: call void @_Z31function_arg_with_default_value9CompleteS(%struct.CompleteS %[[TMP_AGG]]) + +// OGCG: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: %[[ELEM_0_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0 +// OGCG: store i32 1, ptr %[[ELEM_0_PTR]], align 4 +// OGCG: %[[ELEM_1_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1 +// OGCG: store i8 2, ptr %[[ELEM_1_PTR]], align 4 +// OGCG: %[[TMP_AGG:.*]] = load i64, ptr %[[AGG_ADDR]], align 4 +// OGCG: call void @_Z31function_arg_with_default_value9CompleteS(i64 
%[[TMP_AGG]]) diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..a0d5845208529 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c @@ -0,0 +1,131 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..25d0991fa70cd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c @@ -0,0 +1,111 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define 
dso_local @test_sf_vfexp_v_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return 
__riscv_sf_vfexp_v_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..9fc332a1469ff --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) { + return 
__riscv_sf_vfexp_v_bf16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_m(vm, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..67a9220bd011d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c @@ -0,0 +1,234 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv32f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t 
test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_sf_vfexpa_v_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..fd6f82db52953 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..0e769ed5fc5bc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c @@ -0,0 +1,131 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f16.i64( poison, [[VS2]], i64 [[VL]]) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..3df1eaa3a0467 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c @@ -0,0 +1,111 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t 
vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..6179dbe8d82e4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..1ddbb0b84520c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c @@ -0,0 +1,234 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv32f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, 
vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..165879a8bb589 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..aed6d87a4b18a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return 
__riscv_sf_vfexp_v_f16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..374f324cc0808 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..aec0b9f934ab9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t 
vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t 
test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_mu(vm, vd, vs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..b6870264251cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c @@ -0,0 +1,448 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv32f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tu(vd, vs2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_sf_vfexpa_v_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) 
{ + return __riscv_sf_vfexpa_v_f16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( 
[[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..8638dc232cf01 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,167 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_mu(vm, vd, vs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..4ceeb7b35629c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c @@ -0,0 +1,261 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return 
__riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t 
test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], 
[[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..e08d6c5b371cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c @@ -0,0 +1,228 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..14570d465bea8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,272 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return 
__riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..4ac5cfc360551 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c @@ -0,0 +1,492 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv32f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return 
__riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t 
vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], 
[[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..d0faaee571122 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,183 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tu( 
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index be2cd480f7558..e6e2e38bcc097 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1591,6 +1591,132 @@ __m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v32hi( + _mm512_permutex2var_epi16( + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 400, 10, 410, 20, 420, 30, 430, + 40, 440, 50, 450, 60, 460, 70, 470, + 80, 480, 90, 490, 100, 500, 110, 510, + 120, 520, 130, 530, 140, 540, 150, 550)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask_permutex2var_epi16( + (__m512i)(__v32hi){ + -1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16, + -17, -18, -19, -20, -21, -22, -23, -24, + -25, -26, -27, -28, -29, -30, -31, -32}, + 0xAAAAAAAA, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + -1, 400, -3, 410, -5, 420, -7, 430, + -9, 440, -11, 450, -13, 460, -15, 470, + -17, 480, -19, 490, -21, 500, -23, 510, + -25, 520, -27, 530, -29, 540, -31, 550)); +TEST_CONSTEXPR(match_v32hi( + _mm512_maskz_permutex2var_epi16( + 0x55555555, + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 0, 10, 0, 20, 0, 30, 0, + 40, 0, 50, 0, 60, 0, 70, 0, + 80, 0, 90, 0, 100, 0, 110, 0, + 120, 0, 130, 0, 140, 0, 150, 0)); + +TEST_CONSTEXPR(match_v64qu( + _mm512_permutex2var_epi8( + (__m512i)(__v64qu){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 127, 126, 125, + 124, 123, 122, 121, 120, 119, 118, 117, + 116, 115, 114, 113, 112, 111, 110, 109, + 108, 107, 106, 105, 104, 103, 102, 101, + 100, 99, 98, 97, 96, 95, 94, 93, + 92, 91, 90, 89, 88, 87, 86, 85, + 84, 83, 82, 81, 80, 79, 78, 77}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 
18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 210, 220, 230, 240, 250, 254, 253, + 252, 251, 250, 249, 248, 247, 246, 245, + 244, 243, 242, 241, 240, 239, 238, 237, + 236, 235, 234, 233, 232, 231, 230, 229, + 228, 227, 226, 225, 224, 223, 222, 221, + 220, 219, 218, 217, 216, 215, 214, 213, + 212, 211, 210, 209, 208, 207, 206, 205, + 204, 203, 202, 201, 200, 199, 198, 197}), + 0, 200, 10, 210, 20, 220, 30, 230, + 40, 240, 50, 250, 60, 254, 70, 253, + 80, 252, 90, 251, 100, 250, 110, 249, + 120, 248, 127, 247, 126, 246, 125, 245, + 124, 244, 123, 243, 122, 242, 121, 241, + 120, 240, 119, 239, 118, 238, 117, 237, + 116, 236, 115, 235, 114, 234, 113, 233, + 112, 232, 111, 231, 110, 230, 109, 229)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask2_permutex2var_epi16( + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + 0x55555555, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 32, 10, 33, 20, 34, 30, 35, + 40, 36, 50, 37, 60, 38, 70, 39, + 80, 40, 90, 41, 100, 42, 110, 43, + 120, 44, 130, 45, 140, 46, 150, 47)); + __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mulhrs_epi16 // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 @@ -2578,6 +2704,33 @@ __m512i test_mm512_broadcastw_epi16(__m128i __A) { return _mm512_broadcastw_epi16(__A); } TEST_CONSTEXPR(match_v32hi(_mm512_broadcastw_epi16((__m128i)(__v8hi){42, 3, 10, 8, 0, 256, 256, 128}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42)); +TEST_CONSTEXPR(match_v32hi( + _mm512_permutex2var_epi16((__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34, + 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46}, + (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}), + 1, 32, 101, 132, 2, 102, 3, 103, + 4, 104, 5, 105, 6, 106, 7, 107, + 8, 108, 9, 109, 10, 110, 11, 111, + 12, 112, 13, 113, 14, 114, 15, 115)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask_permutex2var_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, + -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32}, + 0xAAAAAAAA, + (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34, + 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46}, + (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}), + -1, -32, -3, 132, -5, 102, -7, 103, + -9, 104, -11, 105, -13, 106, -15, 107, + -17, 108, -19, 109, -21, 110, -23, 111, + -25, 112, -27, 113, -29, 114, -31, 115)); __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) { // 
CHECK-LABEL: test_mm512_mask_broadcastw_epi16 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 69599379b6b3d..8e65430bd3e84 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -5607,6 +5607,56 @@ __m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i _ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B); } + +TEST_CONSTEXPR(match_v16si( + _mm512_permutex2var_epi32( + (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 150, 200, 350, 10, 210, 20, 220, + 30, 230, 40, 240, 50, 250, 60, 260)); +TEST_CONSTEXPR(match_v16si( + _mm512_mask_permutex2var_epi32( + (__m512i)(__v16si){-1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16}, + 0xAAAA, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + -1, -16, -3, 350, -5, 210, -7, 220, + -9, 230, -11, 240, -13, 250, -15, 260)); +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_permutex2var_epi32( + 0x5555, + (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 0, 200, 0, 10, 0, 20, 0, + 30, 0, 40, 0, 50, 0, 60, 0)); +TEST_CONSTEXPR(match_m512( + _mm512_permutex2var_ps( + (__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + 1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f, + 4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f)); +TEST_CONSTEXPR(match_m512d( + _mm512_permutex2var_pd( + (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0)); + __mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_testn_epi32_mask // CHECK: and <16 x i32> %{{.*}}, %{{.*}} @@ -11753,3 +11803,73 @@ void test_mm512_mask_i32loscatter_epi64(void *__addr, __mmask8 __mask, __m512i _ // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512 _mm512_mask_i32loscatter_epi64(__addr, __mask, __index, __v1, 2); } + + +TEST_CONSTEXPR(match_m512d( + _mm512_permutex2var_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0)); +TEST_CONSTEXPR(match_m512d( + _mm512_mask_permutex2var_pd((__m512d){-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}, + 0xAA, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + -1.0, 108.0, -3.0, -8.0, -5.0, -2.0, -7.0, -3.0)); +TEST_CONSTEXPR(match_m512d( + _mm512_maskz_permutex2var_pd(0x55, 
(__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0)); + +TEST_CONSTEXPR(match_m512( + _mm512_permutex2var_ps((__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + 1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f, + 4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f)); +TEST_CONSTEXPR(match_m512( + _mm512_mask_permutex2var_ps((__m512){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f, + -9.f, -10.f, -11.f, -12.f, -13.f, -14.f, -15.f, -16.f}, + 0xAAAA, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + -1.f, -16.f, -3.f, 116.f, -5.f, 102.f, -7.f, 103.f, + -9.f, 104.f, -11.f, 105.f, -13.f, 106.f, -15.f, 107.f)); + +TEST_CONSTEXPR(match_v16si( + _mm512_permutex2var_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 16, 101, 116, 2, 102, 3, 103, + 4, 104, 5, 105, 6, 106, 7, 107)); +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_permutex2var_epi32(0x5555, + (__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 0, 101, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0)); + +TEST_CONSTEXPR(match_v8di( + _mm512_permutex2var_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + 1, 108, 1, 8, 2, 2, 3, 3)); +TEST_CONSTEXPR(match_v8di( + _mm512_mask_permutex2var_epi64((__m512i)(__v8di){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + -1, 108, -3, -8, -5, -2, -7, -3)); diff --git a/clang/test/CodeGen/X86/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c index c3b6298a39b59..7d506db92faeb 100644 --- a/clang/test/CodeGen/X86/avx512vbmi-builtins.c +++ b/clang/test/CodeGen/X86/avx512vbmi-builtins.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ 
-flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include +#include "builtin_test_helpers.h" __m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) { // CHECK-LABEL: test_mm512_mask2_permutex2var_epi8 @@ -33,6 +39,154 @@ __m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i _ return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); } +TEST_CONSTEXPR(match_v64qu( + _mm512_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 200, 1, 201, 2, 202, 3, 203, + 4, 204, 5, 205, 6, 206, 7, 207, + 8, 208, 9, 209, 10, 210, 11, 211, + 12, 212, 13, 213, 14, 214, 15, 215, + 16, 216, 17, 217, 18, 218, 19, 219, + 20, 220, 21, 221, 22, 222, 23, 223, + 24, 224, 25, 225, 26, 226, 27, 227, + 28, 228, 29, 229, 30, 230, 31, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask_permutex2var_epi8((__m512i)(__v64qu){ + 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73}, + 0xAAAAAAAAAAAAAAAAULL, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 10, 200, 12, 201, 14, 202, 16, 203, + 18, 204, 20, 205, 22, 206, 24, 207, + 26, 208, 28, 209, 30, 210, 32, 211, + 34, 212, 36, 213, 38, 214, 40, 215, + 42, 216, 44, 217, 46, 218, 48, 219, + 50, 220, 52, 221, 54, 222, 56, 223, + 58, 224, 60, 225, 62, 226, 64, 227, + 66, 228, 68, 229, 70, 230, 72, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_maskz_permutex2var_epi8(0x5555555555555555ULL, + (__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 
27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 0, 1, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0, + 8, 0, 9, 0, 10, 0, 11, 0, + 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 0, 19, 0, + 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, + 28, 0, 29, 0, 30, 0, 31, 0)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask2_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + 0x5555555555555555ULL, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95)); + __m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_permutexvar_epi8 // CHECK: call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c index c4d5fc8fb6977..49b7a1a721195 100644 --- a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c +++ b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include +#include "builtin_test_helpers.h" __m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_permutexvar_epi8 @@ -77,8 +83,28 @@ __m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, // CHECK-LABEL: test_mm_maskz_permutex2var_epi8 // CHECK: call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108)); +TEST_CONSTEXPR(match_v16qu( + _mm_mask_permutex2var_epi8((__m128i)(__v16qu){200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215}, + 0xAAAA, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108)); __m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi8 @@ -97,8 +123,44 @@ __m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i _ // CHECK-LABEL: test_mm256_maskz_permutex2var_epi8 // CHECK: call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108, + 9, 109, 10, 110, 11, 111, 12, 112, + 13, 113, 14, 114, 15, 115, 16, 116)); +TEST_CONSTEXPR(match_v32qu( + _mm256_mask_permutex2var_epi8((__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 
207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231}, + 0xAAAAAAAA, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108, + 216, 109, 218, 110, 220, 111, 222, 112, + 224, 113, 226, 114, 228, 115, 230, 116)); __m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_multishift_epi64_epi8 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 33c43977f72dc..121d5bf8d4adb 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -5610,12 +5610,23 @@ __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask2_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, 0x05, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 3, 100, 6)); __m256i test_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask2_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + 0xA5, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 7, 100, 15, 1, 110, 2, 120)); __m128d test_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { // CHECK-LABEL: test_mm_mask2_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 @@ -5646,149 +5657,255 @@ __m128i test_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask2_permutex2var_epi64((__m128i)(__v2di){10, 20}, + (__m128i)(__v2di){0, 5}, 0x1, + (__m128i)(__v2di){100, 200}), + 10, 5)); __m256i test_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask2_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, + (__m256i)(__v4di){0, 1, 4, 5}, 0x5, + (__m256i)(__v4di){100, 110, 120, 130}), + 0, 1, 100, 5)); __m128i test_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 return _mm_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 40, 100, 300)); 
__m128i test_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + -1, -4, -3, 300)); __m128i test_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_maskz_permutex2var_epi32(0x0A, (__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 0, 40, 0, 300)); __m256i test_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 return _mm256_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 100, 170, 10, 110, 20, 120)); __m256i test_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, 0xAA, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + -1, -8, -3, 170, -5, 110, -7, 120)); __m256i test_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_permutex2var_epi32(0xAA, (__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 0, 170, 0, 110, 0, 120)); __m128d test_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 return _mm_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_permutex2var_pd((__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + 1.0, 10.0)); __m128d test_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_mask_permutex2var_pd((__m128d){-1.0, -2.0}, 0x2, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + -1.0, 10.0)); __m128d test_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d 
__B) { // CHECK-LABEL: test_mm_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_maskz_permutex2var_pd(0x2, (__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + 0.0, 10.0)); __m256d test_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 return _mm256_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_permutex2var_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 1.0, 10.0, 2.0, 20.0)); __m256d test_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_mask_permutex2var_pd((__m256d){-1.0, -2.0, -3.0, -4.0}, 0x2, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + -1.0, 10.0, -3.0, -4.0)); __m256d test_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_maskz_permutex2var_pd(0x2, (__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 0.0, 10.0, 0.0, 0.0)); __m128 test_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 return _mm_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_permutex2var_ps((__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 1.f, 4.f, 10.f, 30.f)); __m128 test_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_mask_permutex2var_ps((__m128){-1.f, -2.f, -3.f, -4.f}, 0x0A, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + -1.f, -4.f, -3.f, 30.f)); __m128 test_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_maskz_permutex2var_ps(0x0A, (__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 0.f, 4.f, 0.f, 30.f)); __m256 test_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 return _mm256_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_permutex2var_ps((__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 
16.f, 17.f}), + 0.f, 7.f, 10.f, 17.f, 1.f, 11.f, 2.f, 12.f)); __m256 test_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_mask_permutex2var_ps((__m256){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f}, 0xAA, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + -1.f, -8.f, -3.f, 17.f, -5.f, 11.f, -7.f, 12.f)); __m256 test_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_maskz_permutex2var_ps(0xAA, (__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + 0.f, 7.f, 0.f, 17.f, 0.f, 11.f, 0.f, 12.f)); __m128i test_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 return _mm_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_permutex2var_epi64((__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 10, 200)); __m128i test_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask_permutex2var_epi64((__m128i)(__v2di){-1, -2}, 0x2, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + -1, 200)); __m128i test_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_maskz_permutex2var_epi64(0x2, (__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 0, 200)); __m256i test_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 return _mm256_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + 0, 10, 100, 110)); __m256i test_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask_permutex2var_epi64((__m256i)(__v4di){-1, -2, -3, -4}, 0x5, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + -1, -2, 100, -4)); __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i 
__B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_maskz_permutex2var_epi64(0x5, (__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + 0, 0, 100, 0)); +TEST_CONSTEXPR(match_v4si( + _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 40, 100, 300)); +TEST_CONSTEXPR(match_v4si( + _mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + -1, -4, -3, 300)); __m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_cvtepi8_epi32 // CHECK: sext <4 x i8> %{{.*}} to <4 x i32> @@ -10472,6 +10589,17 @@ __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) { TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,0,0)); TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xAAu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 0,0,0,0, 0,4,0,4)); TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xFFu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,4,4)); +TEST_CONSTEXPR(match_v8si( + _mm256_permutex2var_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}), + 1, 8, 101, 108, 2, 102, 3, 103)); +TEST_CONSTEXPR(match_v8si( + _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}), + -1, -8, -3, 108, -5, 102, -7, 103)); __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_mov_pd diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index febef46458ae9..172a3cb219c8a 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -1887,6 +1887,67 @@ __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B); } + +TEST_CONSTEXPR(match_v8hi( + _mm_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, 160, + 170}), + 0, 70, 100, 170, 10, 110, 20, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_mask_permutex2var_epi16((__m128i)(__v8hi){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + -1, -8, -3, 170, -5, 110, -7, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_maskz_permutex2var_epi16(0xAA, + (__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + 0, 70, 0, 170, 0, 110, 0, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_mask2_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + 0x55, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + 0, 7, 
100, 15, 10, 9, 20, 10)); +TEST_CONSTEXPR(match_v16hi( + _mm256_permutex2var_epi16( + (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 150, 200, 350, 10, 210, 20, 220, + 30, 230, 40, 240, 50, 250, 60, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_mask_permutex2var_epi16( + (__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16}, + 0xAAAA, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + -1, -16, -3, 350, -5, 210, -7, 220, + -9, 230, -11, 240, -13, 250, -15, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_maskz_permutex2var_epi16( + 0x5555, + (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 0, 200, 0, 10, 0, 20, 0, + 30, 0, 40, 0, 50, 0, 60, 0)); + __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_maddubs_epi16 // CHECK: @llvm.x86.ssse3.pmadd.ub.sw @@ -3596,3 +3657,22 @@ void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i _ // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256 _mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A); } + + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 127, 126, 125}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){100, 110, 120, 130, 140, 150, 160, 170, + 180, 190, 200, 210, 220, 230, 240, 250}), + 0, 100, 10, 110, 20, 120, 30, 130, + 40, 140, 50, 150, 60, 160, 70, 170)); +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231}), + 0, 200, 10, 201, 20, 202, 30, 203, + 40, 204, 50, 205, 60, 206, 70, 207, + 80, 208, 90, 209, 100, 210, 110, 211, + 120, 212, 127, 213, 126, 214, 125, 215)); diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index a56f8ba1ee385..c7cd9ffdd6966 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -118,36 +118,36 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f); -// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) 
[[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] -// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] +// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] +// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_fabs(f); __builtin_fabsf(f); __builtin_fabsl(f); __builtin_fabsf128(f); -// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_frexp(f,i); __builtin_frexpf(f,i); __builtin_frexpl(f,i); __builtin_frexpf128(f,i); -// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { fp128, i32 } 
@llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]] __builtin_huge_val(); __builtin_huge_valf(); __builtin_huge_vall(); __builtin_huge_valf128(); @@ -165,10 +165,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_ldexp(f,f); __builtin_ldexpf(f,f); __builtin_ldexpl(f,f); __builtin_ldexpf128(f,f); -// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]] @@ -180,7 +180,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] -// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] @@ -209,10 +209,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_pow(f,f); __builtin_powf(f,f); __builtin_powl(f,f); __builtin_powf128(f,f); -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 
noundef) [[NOT_READNONE]] @@ -220,12 +220,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_powi(f,f); __builtin_powif(f,f); __builtin_powil(f,f); -// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] /* math */ __builtin_acos(f); __builtin_acosf(f); __builtin_acosl(f); __builtin_acosf128(f); @@ -307,21 +307,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_ceil(f); __builtin_ceilf(f); __builtin_ceill(f); __builtin_ceilf128(f); -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_cos(f); __builtin_cosf(f); __builtin_cosl(f); __builtin_cosf128(f); -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]] // 
HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]] @@ -362,10 +362,10 @@ __builtin_erfc(f); __builtin_erfcf(f); __builtin_erfcl(f); __builtin_ __builtin_exp(f); __builtin_expf(f); __builtin_expl(f); __builtin_expf128(f); -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]] @@ -373,10 +373,10 @@ __builtin_exp(f); __builtin_expf(f); __builtin_expl(f); __builtin_e __builtin_exp2(f); __builtin_exp2f(f); __builtin_exp2l(f); __builtin_exp2f128(f); -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -384,10 +384,10 @@ __builtin_exp2(f); __builtin_exp2f(f); __builtin_exp2l(f); __builtin_ __builtin_exp10(f); __builtin_exp10f(f); __builtin_exp10l(f); __builtin_exp10f128(f); -// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -417,22 +417,22 @@ __builtin_fdim(f,f); __builtin_fdimf(f,f); __builtin_fdiml(f,f); __bu __builtin_floor(f); __builtin_floorf(f); __builtin_floorl(f); __builtin_floorf128(f); -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] -// 
HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f); __builtin_fmaf16(f,f,f); -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC]] -// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC2]] +// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -454,25 +454,25 @@ __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmax(f,f); __builtin_fmaxf(f,f); __builtin_fmaxl(f,f); __builtin_fmaxf128(f,f); -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) 
[[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_fmin(f,f); __builtin_fminf(f,f); __builtin_fminl(f,f); __builtin_fminf128(f,f); -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_hypot(f,f); __builtin_hypotf(f,f); __builtin_hypotl(f,f); __builtin_hypotf128(f,f); @@ -509,10 +509,10 @@ __builtin_lgamma(f); __builtin_lgammaf(f); __builtin_lgammal(f); __builti __builtin_llrint(f); __builtin_llrintf(f); __builtin_llrintl(f); __builtin_llrintf128(f); -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -520,10 +520,10 @@ __builtin_llrint(f); __builtin_llrintf(f); __builtin_llrintl(f); __builti __builtin_llround(f); __builtin_llroundf(f); __builtin_llroundl(f); __builtin_llroundf128(f); -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) 
[[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -531,10 +531,10 @@ __builtin_llround(f); __builtin_llroundf(f); __builtin_llroundl(f); __built __builtin_log(f); __builtin_logf(f); __builtin_logl(f); __builtin_logf128(f); -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]] @@ -542,10 +542,10 @@ __builtin_log(f); __builtin_logf(f); __builtin_logl(f); __builtin_l __builtin_log10(f); __builtin_log10f(f); __builtin_log10l(f); __builtin_log10f128(f); -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -564,10 +564,10 @@ __builtin_log1p(f); __builtin_log1pf(f); __builtin_log1pl(f); __builtin __builtin_log2(f); __builtin_log2f(f); __builtin_log2l(f); __builtin_log2f128(f); -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -586,10 +586,10 @@ __builtin_logb(f); __builtin_logbf(f); __builtin_logbl(f); __builtin_ __builtin_lrint(f); __builtin_lrintf(f); __builtin_lrintl(f); 
__builtin_lrintf128(f); -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -597,10 +597,10 @@ __builtin_lrint(f); __builtin_lrintf(f); __builtin_lrintl(f); __builtin __builtin_lround(f); __builtin_lroundf(f); __builtin_lroundl(f); __builtin_lroundf128(f); -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -608,14 +608,14 @@ __builtin_lround(f); __builtin_lroundf(f); __builtin_lroundl(f); __built __builtin_nearbyint(f); __builtin_nearbyintf(f); __builtin_nearbyintl(f); __builtin_nearbyintf128(f); -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_nextafter(f,f); __builtin_nextafterf(f,f); __builtin_nextafterl(f,f); __builtin_nextafterf128(f,f); @@ -663,25 +663,25 @@ __builtin_remquo(f,f,i); 
__builtin_remquof(f,f,i); __builtin_remquol(f,f,i); __ __builtin_rint(f); __builtin_rintf(f); __builtin_rintl(f); __builtin_rintf128(f); -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_round(f); __builtin_roundf(f); __builtin_roundl(f); __builtin_roundf128(f); -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_scalbln(f,f); __builtin_scalblnf(f,f); __builtin_scalblnl(f,f); __builtin_scalblnf128(f,f); @@ -707,10 +707,10 @@ __builtin_scalbn(f,f); __builtin_scalbnf(f,f); __builtin_scalbnl(f,f); __ __builtin_sin(f); __builtin_sinf(f); __builtin_sinl(f); __builtin_sinf128(f); -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) 
[[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]] @@ -747,10 +747,10 @@ __builtin_sincospi(f,d,d); __builtin_sincospif(f,fp,fp); __builtin_sincospil(f,l __builtin_sqrt(f); __builtin_sqrtf(f); __builtin_sqrtl(f); __builtin_sqrtf128(f); -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]] @@ -791,22 +791,24 @@ __builtin_tgamma(f); __builtin_tgammaf(f); __builtin_tgammal(f); __builti __builtin_trunc(f); __builtin_truncf(f); __builtin_truncl(f); __builtin_truncf128(f); -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]] }; // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } // NO__ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} } // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } diff --git a/clang/test/CodeGen/builtin-sqrt.c b/clang/test/CodeGen/builtin-sqrt.c index 2313a68d2d0e2..3ebf2ac91ccdf 100644 --- a/clang/test/CodeGen/builtin-sqrt.c +++ 
b/clang/test/CodeGen/builtin-sqrt.c @@ -11,5 +11,5 @@ float foo(float X) { // HAS_ERRNO-NOT: attributes [[ATTR]] = {{{.*}} memory(none) // NO_ERRNO: declare float @llvm.sqrt.f32(float) [[ATTR:#[0-9]+]] -// NO_ERRNO: attributes [[ATTR]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// NO_ERRNO: attributes [[ATTR]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/clang/test/CodeGen/libcalls.c b/clang/test/CodeGen/libcalls.c index 1e4b06e34aaf9..923719f3ec8e4 100644 --- a/clang/test/CodeGen/libcalls.c +++ b/clang/test/CodeGen/libcalls.c @@ -74,9 +74,9 @@ void test_fma(float a0, double a1, long double a2) { // CHECK-YES: declare float @fmaf(float noundef, float noundef, float noundef) // CHECK-YES: declare double @fma(double noundef, double noundef, double noundef) // CHECK-YES: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) -// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RN2:#[0-9]+]] -// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RN2]] -// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RNI]] +// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RNI]] // Just checking to make sure these library functions are marked readnone void test_builtins(double d, float f, long double ld) { @@ -85,9 +85,9 @@ void test_builtins(double d, float f, long double ld) { double atan_ = atan(d); long double atanl_ = atanl(ld); float atanf_ = atanf(f); -// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RNI:#[0-9]+]] -// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RNI]] -// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RN2:#[0-9]+]] +// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RN2]] // CHECK-YES: declare double @atan(double noundef) [[NUW:#[0-9]+]] // CHECK-YES: declare x86_fp80 @atanl(x86_fp80 noundef) [[NUW]] // CHECK-YES: declare float @atanf(float noundef) [[NUW]] @@ -95,9 +95,9 @@ void test_builtins(double d, float f, long double ld) { double atan2_ = atan2(d, 2); long double atan2l_ = atan2l(ld, ld); float atan2f_ = atan2f(f, f); -// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RNI]] -// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RNI]] -// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RN2]] +// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RN2]] // CHECK-YES: declare double @atan2(double noundef, double noundef) [[NUW]] // CHECK-YES: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW]] // CHECK-YES: declare float @atan2f(float noundef, float noundef) [[NUW]] @@ -124,4 +124,4 @@ void test_builtins(double d, float f, long double ld) { } // CHECK-YES: attributes [[NUW]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" } -// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// CHECK-NO-DAG: attributes [[NUW_RNI]] = { 
nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index ad297828f48ed..d4cd6f86b3c51 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -35,27 +35,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { copysign(f,f); copysignf(f,f);copysignl(f,f); - // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] - // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] - // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] fabs(f); fabsf(f); fabsl(f); - // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare x86_fp80 
@llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] frexp(f,i); frexpf(f,i); frexpl(f,i); @@ -86,7 +86,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] @@ -107,9 +107,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { pow(f,f); powf(f,f); powl(f,f); -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -206,21 +206,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { ceil(f); ceilf(f); ceill(f); -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ceil.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ceil.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ceil.f80( cos(f); cosf(f); cosl(f); -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]] // 
HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]] @@ -266,9 +266,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { exp(f); expf(f); expl(f); -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]] @@ -278,9 +278,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { exp2(f); exp2f(f); exp2l(f); -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -314,21 +314,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { floor(f); floorf(f); floorl(f); -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.floor.f64 // HAS_MAYTRAP: declare float @llvm.experimental.constrained.floor.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.floor.f80( fma(f,f,f); fmaf(f,f,f); fmal(f,f,f); -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) 
[[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -350,39 +350,39 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { fmax(f,f); fmaxf(f,f); fmaxl(f,f); -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.maxnum.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.maxnum.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.maxnum.f80( fmin(f,f); fminf(f,f); fminl(f,f); -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.minnum.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.minnum.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.minnum.f80( fmaximum_num(*d,*d); fmaximum_numf(f,f); fmaximum_numl(*l,*l); -// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC]] -// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC]] -// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// COMMON: declare double @llvm.maximumnum.f64(double, double) 
[[READNONE_INTRINSIC2]] +// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC2]] +// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] fminimum_num(*d,*d); fminimum_numf(f,f); fminimum_numl(*l,*l); -// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC]] -// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC]] -// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC2]] +// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC2]] +// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] hypot(f,f); hypotf(f,f); hypotl(f,f); @@ -422,9 +422,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { llrint(f); llrintf(f); llrintl(f); -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -434,9 +434,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { llround(f); llroundf(f); llroundl(f); -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -446,9 +446,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log(f); logf(f); logl(f); -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]] @@ -458,9 +458,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log10(f); log10f(f); log10l(f); -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log10.f32(float) 
[[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -482,9 +482,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log2(f); log2f(f); log2l(f); -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -506,9 +506,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { lrint(f); lrintf(f); lrintl(f); -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -518,9 +518,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { lround(f); lroundf(f); lroundl(f); -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -530,12 +530,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { nearbyint(f); nearbyintf(f); nearbyintl(f); -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare 
x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.nearbyint.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.nearbyint.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80( @@ -590,24 +590,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { rint(f); rintf(f); rintl(f); -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.rint.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.rint.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.rint.f80( round(f); roundf(f); roundl(f); -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.round.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.round.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.round.f80( @@ -638,9 +638,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { sin(f); sinf(f); sinl(f); -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare 
x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]] @@ -674,9 +674,9 @@ sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); sqrt(f); sqrtf(f); sqrtl(f); -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]] @@ -722,20 +722,22 @@ sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); trunc(f); truncf(f); truncl(f); -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] }; // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } diff --git a/clang/test/CodeGen/ms-empty-enum.c b/clang/test/CodeGen/ms-empty-enum.c new file mode 100644 index 0000000000000..6c1c87b756f9a --- /dev/null +++ b/clang/test/CodeGen/ms-empty-enum.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -fms-extensions -triple x86_64-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fms-extensions -triple i386-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s + +typedef enum tag1 {} A; + +// 
CHECK: void @empty_enum(i32 noundef %a) +void empty_enum(A a) {} diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl new file mode 100644 index 0000000000000..a7c01015b2015 --- /dev/null +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl @@ -0,0 +1,95 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case1v( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <6 x float> +// +float3x2 case1() { + // vec[0] = 0 + // vec[1] = 2 + // vec[2] = 4 + // vec[3] = 1 + // vec[4] = 3 + // vec[5] = 5 + return float3x2(0, 1, + 2, 3, + 4, 5); +} + + +RWStructuredBuffer In; + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case2v( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 0) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[CALL1:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 1) #[[ATTR3]] +// CHECK-NEXT: [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 2) #[[ATTR3]] +// CHECK-NEXT: [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 3) #[[ATTR3]] +// CHECK-NEXT: [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 4) #[[ATTR3]] +// CHECK-NEXT: [[CALL5:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 5) #[[ATTR3]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[CALL]], align 4 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[CALL2]], align 4 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[CALL4]], align 4 +// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[CALL1]], align 4 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 3 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[CALL3]], align 4 +// CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[CALL5]], align 4 +// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT9]], float [[TMP5]], i32 5 +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] +// +float3x2 case2() { + // vec[0] = Call + // vec[1] = Call2 + // vec[2] = Call4 + // vec[3] = Call1 + // vec[4] = Call3 + // vec[5] = Call5 + return float3x2(In[0], In[1], + In[2], In[3], + In[4], 
In[5]); +} + + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case3Dv3_fS_( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[A:%.*]], <3 x float> noundef nofpclass(nan inf) [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: store <3 x float> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <3 x float> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <3 x float> [[TMP0]], i64 0 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[VECEXT]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 2 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 1 +// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], i64 1 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 3 +// CHECK-NEXT: [[TMP4:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 0 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <3 x float> [[TMP5]], i64 2 +// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[VECEXT9]], i32 5 +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] +// +float3x2 case3(float3 a, float3 b) { + // vec[0] = A[0] + // vec[1] = A[2] + // vec[2] = B[1] + // vec[3] = A[1] + // vec[4] = B[0] + // vec[5] = B[2] + return float3x2(a,b); +} diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl new file mode 100644 index 0000000000000..65dba664bb5ea --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s + +// CHECK: define hidden noundef nofpclass(nan inf) float +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0) +// CHECK: ret float %hlsl.f16tof32 +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32) +float test_scalar(uint p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float> +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0) +// CHECK: ret <2 x float> %hlsl.f16tof32 +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>) +float2 test_uint2(uint2 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 { +// CHECK: 
%hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0) +// CHECK: ret <3 x float> %hlsl.f16tof32 +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>) +float3 test_uint3(uint3 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0) +// CHECK: ret <4 x float> %hlsl.f16tof32 +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>) +float4 test_uint4(uint4 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + + + diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl new file mode 100644 index 0000000000000..b68bc197f16c5 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s + +// CHECK: define hidden noundef nofpclass(nan inf) float +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0) +// CHECK: ret float %hlsl.f16tof32 +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32) +float test_scalar(uint p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float> +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0) +// CHECK: ret <2 x float> %hlsl.f16tof32 +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>) +float2 test_uint2(uint2 p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0) +// CHECK: ret <3 x float> %hlsl.f16tof32 +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>) +float3 test_uint3(uint3 p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0) +// CHECK: ret <4 x float> %hlsl.f16tof32 +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>) +float4 test_uint4(uint4 p0) { return f16tof32(p0); } + + + diff --git a/clang/test/CodeGenHLSL/builtins/select.hlsl b/clang/test/CodeGenHLSL/builtins/select.hlsl index 7590b4a881259..e5169844cb3f2 100644 --- a/clang/test/CodeGenHLSL/builtins/select.hlsl +++ b/clang/test/CodeGenHLSL/builtins/select.hlsl @@ -20,16 +20,6 @@ struct S test_select_infer_struct(bool cond0, struct S tVal, struct S fVal) { return select(cond0, tVal, fVal); } -// CHECK-LABEL: test_select_infer_array -// CHECK: [[TRUE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 -// CHECK: [[FALSE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 -// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, [3 x i32] [[TRUE_VAL]], [3 x i32] [[FALSE_VAL]] -// CHECK: store [3 x i32] [[SELECT]], ptr {{%.*}}, align 4 -// CHECK: ret void -int test_select_infer_array(bool cond, int tVal[3], int fVal[3])[3] { - return select(cond, tVal, fVal); -} - // CHECK-LABEL: test_select_bool_vector // CHECK: [[SELECT:%.*]] = 
select i1 {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] @@ -38,24 +28,24 @@ int2 test_select_bool_vector(bool cond0, int2 tVal, int2 fVal) { } // CHECK-LABEL: test_select_vector_1 -// CHECK: [[SELECT:%.*]] = select <1 x i1> {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}} +// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}} // CHECK: ret <1 x i32> [[SELECT]] int1 test_select_vector_1(bool1 cond0, int1 tVals, int1 fVals) { - return select(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_2 // CHECK: [[SELECT:%.*]] = select <2 x i1> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] int2 test_select_vector_2(bool2 cond0, int2 tVals, int2 fVals) { - return select(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_3 // CHECK: [[SELECT:%.*]] = select <3 x i1> {{%.*}}, <3 x i32> {{%.*}}, <3 x i32> {{%.*}} // CHECK: ret <3 x i32> [[SELECT]] int3 test_select_vector_3(bool3 cond0, int3 tVals, int3 fVals) { - return select(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_4 @@ -86,10 +76,54 @@ int4 test_select_vector_vector_scalar(bool4 cond0, int4 tVals, int fVal) { // CHECK-LABEL: test_select_vector_scalar_scalar // CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 // CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 %3, i64 0 +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 // CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK: [[SELECT:%.*]] = select <4 x i1> {{%.*}}, <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] // CHECK: ret <4 x i32> [[SELECT]] int4 test_select_vector_scalar_scalar(bool4 cond0, int tVal, int fVal) { return select(cond0, tVal, fVal); } + +// CHECK-LABEL: test_select_nonbool_cond_vector_4 +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i1> {{%.*}}, <4 x i1> {{%.*}} +// CHECK: ret <4 x i1> [[SELECT]] +bool4 test_select_nonbool_cond_vector_4(int4 cond0, bool4 tVal, bool4 fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_vector +// CHECK: [[TMP0:%.*]] = load <3 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <3 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <3 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <3 x i32> [[SPLAT_SRC1]], <3 x i32> poison, <3 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <3 x i1> [[TOBOOL]], <3 x i32> [[SPLAT1]], <3 x i32> {{%.*}} +// CHECK: ret <3 x i32> [[SELECT]] +int3 test_select_nonbool_cond_vector_scalar_vector(int3 cond0, int tVal, int3 fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_vector_scalar +// CHECK: [[TMP0:%.*]] = load <2 x i32>, ptr %cond0.addr, align 8 +// CHECK: [[TOBOOL:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <2 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <2 x i32> [[SPLAT_SRC1]], <2 x i32> poison, <2 x i32> zeroinitializer 
+// CHECK: [[SELECT:%.*]] = select <2 x i1> [[TOBOOL]], <2 x i32> {{%.*}}, <2 x i32> [[SPLAT1]] +// CHECK: ret <2 x i32> [[SELECT]] +int2 test_select_nonbool_cond_vector_vector_scalar(int2 cond0, int2 tVal, int fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_scalar +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] +// CHECK: ret <4 x i32> [[SELECT]] +int4 test_select_nonbool_cond_vector_scalar_scalar(int4 cond0, int tVal, int fVal) { + return select(cond0, tVal, fVal); +} diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl index ea1f734391614..5cbf6452d4c85 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl @@ -199,7 +199,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } // SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } // SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // SPIR32: attributes #[[ATTR5]] = { convergent nounwind "uniform-work-group-size"="true" } //. diff --git a/clang/test/DebugInfo/KeyInstructions/flag.cpp b/clang/test/DebugInfo/KeyInstructions/flag.cpp index 6aeeed664135e..4a4a5c4c142a7 100644 --- a/clang/test/DebugInfo/KeyInstructions/flag.cpp +++ b/clang/test/DebugInfo/KeyInstructions/flag.cpp @@ -8,6 +8,9 @@ // KEY-INSTRUCTIONS: "-gkey-instructions" // NO-KEY-INSTRUCTIONS-NOT: key-instructions + +// Only expect one dwarf related flag. +// NO-DEBUG: -fdwarf2-cfi-asm // NO-DEBUG-NOT: debug-info-kind // NO-DEBUG-NOT: dwarf diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c new file mode 100644 index 0000000000000..67d894e2eb506 --- /dev/null +++ b/clang/test/Driver/hip-spirv-translator-new-driver.c @@ -0,0 +1,9 @@ +// The --offload-new-driver was crashing when using -save-temps due to a failure in clang-linker-wrapper. +// The input and output files cannot be the same. 
+ +// RUN: %clang --offload-new-driver -### -save-temps -nogpuinc -nogpulib \ +// RUN: --target=x86_64-unknown-linux-gnu --offload-arch=amdgcnspirv -x hip %s 2>&1 \ +// RUN: | FileCheck %s + +// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]" +// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}} diff --git a/clang/test/Driver/mg.c b/clang/test/Driver/mg.c index 82d8a6084e5e0..b7458a08698d3 100644 --- a/clang/test/Driver/mg.c +++ b/clang/test/Driver/mg.c @@ -1,5 +1,7 @@ -// RUN: %clang -M -MG -include nonexistent-preinclude.h %s | FileCheck %s +// RUN: %clang -M -MG -include nonexistent-preinclude.h -std=c23 %s | FileCheck %s // CHECK: nonexistent-preinclude.h // CHECK: nonexistent-ppinclude.h +// CHECK: nonexistent-embed #include "nonexistent-ppinclude.h" +#embed "nonexistent-embed" diff --git a/clang/test/Driver/no-gpu-bundle-respected.hip b/clang/test/Driver/no-gpu-bundle-respected.hip new file mode 100644 index 0000000000000..fc93640dc4b90 --- /dev/null +++ b/clang/test/Driver/no-gpu-bundle-respected.hip @@ -0,0 +1,24 @@ +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefix=BUNDLE + +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefix=BUNDLE + +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --no-gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COMPILER,GFX1030,GFX900,OFFLOAD,NOBUNDLE + +// BUNDLE: clang-offload-bundler +// NOBUNDLE-NOT: clang-offload-bundler + +// COM: sanity checks +// COMPILER: compiler +// GFX1030: (device-hip, gfx1030) +// GFX900: (device-hip, gfx900) +// OFFLOAD: offload + +int square(int num) { + return num * num; +} diff --git a/clang/test/Frontend/diags-interesting-source-region-colors.cpp b/clang/test/Frontend/diags-interesting-source-region-colors.cpp new file mode 100644 index 0000000000000..80db0873b9e0a --- /dev/null +++ b/clang/test/Frontend/diags-interesting-source-region-colors.cpp @@ -0,0 +1,30 @@ +// RUN: not %clang_cc1 %s -fmessage-length=40 -fcolor-diagnostics -fno-show-source-location -Wunused-value -o - 2>&1 | FileCheck %s + +// REQUIRES: ansi-escape-sequences + +int main() { + 1 + + if; + // CHECK: expected expression + // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]]; + + /*😂*/1 + + if; + // CHECK: expected expression + // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]]; + + a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1; + // CHECK: use of undeclared identifier + // CHECK-NEXT: a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] ... + + + /*😂😂😂*/ a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1; + // CHECK: use of undeclared identifier + // CHECK-NEXT: [[YELLOW:.\[0;33m]]/*😂😂😂*/[[RESET]] a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] ... 
+ + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // CHECK: [[GREEN:.\[0;32m]]"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]]; + + "😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // CHECK: [[GREEN:.\[0;32m]]"😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]]; +} + + diff --git a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap index 186965177caaf..8ab6ae4779ea9 100644 --- a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap +++ b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap @@ -49,6 +49,11 @@ module cstd [system] [no_undeclared_includes] { export * } + module stdckdint { + header "stdckdint.h" + export * + } + module stdcountof { header "stdcountof.h" export * diff --git a/clang/test/Modules/builtin-headers.mm b/clang/test/Modules/builtin-headers.mm index ad2d66ae38dfd..6cd366228172e 100644 --- a/clang/test/Modules/builtin-headers.mm +++ b/clang/test/Modules/builtin-headers.mm @@ -17,6 +17,7 @@ @import _Builtin_stdarg; @import _Builtin_stdatomic; @import _Builtin_stdbool; +@import _Builtin_stdckdint; @import _Builtin_stdcountof; @import _Builtin_stddef; @import _Builtin_stdint; diff --git a/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm new file mode 100644 index 0000000000000..90c57796dcf7e --- /dev/null +++ b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm @@ -0,0 +1,46 @@ +// Fixes #165445 + +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -x c++-user-header %t/header.h \ +// RUN: -emit-header-unit -o %t/header.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -fmodule-file=%t/header.pcm \ +// RUN: -emit-module-interface -o %t/A.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fmodule-file=%t/header.pcm \ +// RUN: -emit-module-interface -o %t/B.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/use.cpp \ +// RUN: -fmodule-file=A=%t/A.pcm -fmodule-file=B=%t/B.pcm \ +// RUN: -fmodule-file=%t/header.pcm \ +// RUN: -verify -fsyntax-only + +//--- enum.h +enum E { Value }; + +//--- header.h +#include "enum.h" + +//--- A.cppm +module; +#include "enum.h" +export module A; + +auto e = Value; + +//--- B.cppm +export module B; +import "header.h"; + +auto e = Value; + +//--- use.cpp +// expected-no-diagnostics +import A; +import B; +#include "enum.h" + +auto e = Value; diff --git a/clang/test/OpenMP/metadirective_ast_print.c b/clang/test/OpenMP/metadirective_ast_print.c index 638dbae1bc774..75ef5fa26827c 100644 --- a/clang/test/OpenMP/metadirective_ast_print.c +++ b/clang/test/OpenMP/metadirective_ast_print.c @@ -2,17 +2,25 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT -// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU -// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU + +// RUN: %clang_cc1 -verify -fopenmp -triple spirv64-intel -x c -std=c99 -ast-print 
%s -o - | FileCheck %s --check-prefix=DEFAULT-GPU + +// RUN: %clang_cc1 -verify -fopenmp-simd -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU // expected-no-diagnostics #ifndef HEADER @@ -77,6 +85,12 @@ void foo1(void) { for (int i = 0; i < 100; i++) ; +#pragma omp metadirective when(device={arch("spirv64")}: \ + teams distribute parallel for)\ + otherwise(parallel for) + for (int i = 0; i < 100; i++) + ; + #pragma omp metadirective when(implementation = {extension(match_all)} \ : nothing) otherwise(parallel for) for (int i = 0; i < 16; i++) @@ -134,8 +148,8 @@ void foo1(void) { // OMP52-NEXT: for (int i = 0; i < 16; i++) { // OMP52-NEXT: #pragma omp simd // OMP52-NEXT: for (int j = 0; j < 16; j++) -// OMP52-AMDGCN: #pragma omp teams distribute parallel for -// OMP52-AMDGCN-NEXT: for (int i = 0; i < 100; i++) +// OMP52-GPU: #pragma omp teams distribute parallel for +// OMP52-GPU-NEXT: for (int i = 0; i < 100; i++) // OMP52: for (int i = 0; i < 16; i++) // OMP52: for (int i = 0; i < 16; i++) @@ -198,6 +212,12 @@ void foo2(void) { for (int i = 0; i < 100; i++) ; +#pragma omp metadirective when(device={arch("spirv64")}: \ + teams distribute parallel for)\ + default(parallel for) + for (int i = 0; i < 100; i++) + ; + #pragma omp metadirective when(implementation = {extension(match_all)} \ : nothing) default(parallel for) for (int i = 0; i < 16; i++) @@ -266,8 +286,8 @@ void foo2(void) { // DEFAULT-NEXT: for (int i = 0; i < 16; i++) { // DEFAULT-NEXT: #pragma omp simd // DEFAULT-NEXT: for (int j = 0; j < 16; j++) -// DEFAULT-AMDGCN: #pragma omp teams distribute parallel for -// DEFAULT-AMDGCN-NEXT: for (int i = 0; i < 100; i++) +// DEFAULT-GPU: #pragma omp teams distribute parallel for +// DEFAULT-GPU-NEXT: for (int i = 0; i < 100; i++) // DEFAULT: for (int i = 0; i < 16; i++) // DEFAULT: for (int i = 0; i < 16; i++) diff --git a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp index eecae310d0a77..1d5584de67162 100644 --- a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp +++ b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp @@ -1,7 +1,7 @@ -// REQUIRES: 
amdgpu-registered-target - // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -target-cpu gfx906 -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-ppc-spirv-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-spirv-host.bc -o - | FileCheck %s // expected-no-diagnostics @@ -16,6 +16,12 @@ Inspired from SOLLVE tests: #define N 1024 +#ifdef __AMDGPU__ +#define GPU "amdgcn" +#else +#define GPU "spirv64" +#endif + int metadirective1() { int v1[N], v2[N], v3[N]; @@ -26,7 +32,7 @@ int metadirective1() { #pragma omp target map(to:v1,v2) map(from:v3, target_device_num) device(default_device) { #pragma omp metadirective \ - when(device={arch("amdgcn")}: teams distribute parallel for) \ + when(device={arch(GPU)}: teams distribute parallel for) \ default(parallel for) for (int i = 0; i < N; i++) { @@ -38,28 +44,28 @@ int metadirective1() { return errors; } -// CHECK: define weak_odr protected amdgpu_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]] +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]] // CHECK: entry: -// CHECK: %{{[0-9]}} = call i32 @__kmpc_target_init +// CHECK: %{{[0-9]}} = call{{.*}} i32 @__kmpc_target_init // CHECK: user_code.entry: -// CHECK: call void @[[METADIRECTIVE]]_omp_outlined -// CHECK-NOT: call void @__kmpc_parallel_51 +// CHECK: call{{.*}} void @[[METADIRECTIVE]]_omp_outlined +// CHECK-NOT: call{{.*}} void @__kmpc_parallel_51 // CHECK: ret void // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined // CHECK: entry: -// CHECK: call void @__kmpc_distribute_static_init +// CHECK: call{{.*}} void @__kmpc_distribute_static_init // CHECK: omp.loop.exit: -// CHECK: call void @__kmpc_distribute_static_fini +// CHECK: call{{.*}} void @__kmpc_distribute_static_fini // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined_omp_outlined // CHECK: entry: -// CHECK: call void @__kmpc_for_static_init_4 +// CHECK: call{{.*}} void @__kmpc_for_static_init_4 // CHECK: omp.inner.for.body: // CHECK: store atomic {{.*}} monotonic // CHECK: omp.loop.exit: -// CHECK-NEXT: call void @__kmpc_for_static_fini +// CHECK-NEXT: call{{.*}} void @__kmpc_for_static_fini // CHECK-NEXT: ret void diff --git a/clang/test/OpenMP/thread_limit_amdgpu.c b/clang/test/OpenMP/thread_limit_amdgpu.c deleted file mode 100644 index f884eeb73c3ff..0000000000000 --- a/clang/test/OpenMP/thread_limit_amdgpu.c +++ /dev/null @@ -1,34 +0,0 @@ -// Test target codegen - host bc file has to be created first. 
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -// expected-no-diagnostics - -#ifndef HEADER -#define HEADER - -void foo(int N) { -#pragma omp target teams distribute parallel for simd - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd thread_limit(4) - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) - for (int i = 0; i < N; ++i) - ; -} - -#endif - -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l10({{.*}}) #[[ATTR1:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l13({{.*}}) #[[ATTR2:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l16({{.*}}) #[[ATTR3:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l19({{.*}}) #[[ATTR4:.+]] { - -// CHECK: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } -// CHECK: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } -// CHECK: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } -// CHECK: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } diff --git a/clang/test/OpenMP/thread_limit_gpu.c b/clang/test/OpenMP/thread_limit_gpu.c new file mode 100644 index 0000000000000..4bcc14d070c22 --- /dev/null +++ b/clang/test/OpenMP/thread_limit_gpu.c @@ -0,0 +1,41 @@ +// Test target codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-AMDGPU %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-x86-spirv-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-SPIRV %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void foo(int N) { +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd thread_limit(4) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) + for (int i = 0; i < N; ++i) + ; +} + +#endif + +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l12({{.*}}) #[[ATTR1:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] { + +// CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } + +// CHECK-SPIRV: attributes #[[ATTR1]] = { {{.*}} "omp_target_thread_limit"="256" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR2]] = { {{.*}} "omp_target_thread_limit"="4" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="42" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="22" {{.*}} } diff --git a/clang/test/Parser/ms-empty-enum.c b/clang/test/Parser/ms-empty-enum.c new file mode 100644 index 0000000000000..790547af88bab --- /dev/null +++ b/clang/test/Parser/ms-empty-enum.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions + +typedef enum tag1 { } A; // expected-warning {{empty enumeration types are a Microsoft extension}} +typedef enum tag2 { } B; // expected-warning {{empty enumeration types are a Microsoft extension}} +typedef enum : unsigned { } C; // expected-warning {{enumeration types with a fixed underlying type are a Microsoft extension}}\ + // expected-warning {{empty enumeration types are a Microsoft extension}} diff --git 
a/clang/test/Preprocessor/unwind-tables.c b/clang/test/Preprocessor/unwind-tables.c index 0a863d79adbf6..5ff990d0c40a6 100644 --- a/clang/test/Preprocessor/unwind-tables.c +++ b/clang/test/Preprocessor/unwind-tables.c @@ -1,11 +1,13 @@ // RUN: %clang %s -dM -E -target x86_64-windows | FileCheck %s --check-prefix=NO // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables | FileCheck %s --check-prefix=NO +// RUN: %clang %s -dM -E -target x86_64 -fno-dwarf2-cfi-asm | FileCheck %s --check-prefix=NO // RUN: %clang %s -dM -E -target x86_64 | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -funwind-tables -fno-asynchronous-unwind-tables -g | FileCheck %s // RUN: %clang %s -dM -E -target aarch64-apple-darwin | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -g | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -fexceptions | FileCheck %s +// RUN: %clang %s -dM -E -target x86_64-windows -fdwarf2-cfi-asm | FileCheck %s // NO-NOT: #define __GCC_HAVE_DWARF2_CFI_ASM // CHECK: #define __GCC_HAVE_DWARF2_CFI_ASM 1 diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp index b26a945843696..881e816292d59 100644 --- a/clang/test/Sema/attr-nonblocking-constraints.cpp +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp @@ -235,16 +235,35 @@ void nb13() [[clang::nonblocking]] { nb12(); } // C++ member function pointers struct PTMFTester { typedef void (PTMFTester::*ConvertFunction)() [[clang::nonblocking]]; - - void convert() [[clang::nonblocking]]; + typedef void (PTMFTester::*BlockingFunction)(); ConvertFunction mConvertFunc; -}; -void PTMFTester::convert() [[clang::nonblocking]] -{ - (this->*mConvertFunc)(); -} + void convert() [[clang::nonblocking]] + { + (this->*mConvertFunc)(); // This should not generate a warning. + } + + template + struct Holder { + T value; + + T& operator*() { return value; } + }; + + + void ptmfInExpr(Holder& holder) [[clang::nonblocking]] + { + (this->*(*holder))(); // Should not generate a warning. + ((*this).*(*holder))(); // Should not generate a warning. + } + + void ptmfInExpr(Holder& holder) [[clang::nonblocking]] + { + (this->*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}} + ((*this).*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}} + } +}; // Allow implicit conversion from array to pointer. void nb14(unsigned idx) [[clang::nonblocking]] @@ -354,6 +373,33 @@ struct Unsafe { Unsafe(float y) [[clang::nonblocking]] : Unsafe(int(y)) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} }; +// Exercise cases of a temporary with a safe constructor and unsafe destructor. 
+void nb23() +{ + struct X { + int *ptr = nullptr; + X() {} + ~X() { delete ptr; } // expected-note 2 {{destructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}} + }; + + auto inner = []() [[clang::nonblocking]] { + X(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}} + }; + + auto inner2 = [](X x) [[clang::nonblocking]] { // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}} + }; + +} + +struct S2 { ~S2(); }; // expected-note 2 {{declaration cannot be inferred 'nonblocking' because it has no definition in this translation unit}} +void nb24() { + S2 s; + [&]() [[clang::nonblocking]] { + [s]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}} + [=]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}} + }(); +} + struct DerivedFromUnsafe : public Unsafe { DerivedFromUnsafe() [[clang::nonblocking]] {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} DerivedFromUnsafe(int x) [[clang::nonblocking]] : Unsafe(x) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} diff --git a/clang/test/Sema/labeled-break-continue.c b/clang/test/Sema/labeled-break-continue.c index 78f81c484c3d5..6b4adc23dca8d 100644 --- a/clang/test/Sema/labeled-break-continue.c +++ b/clang/test/Sema/labeled-break-continue.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c2y -verify -fsyntax-only -fblocks %s -// RUN: %clang_cc1 -std=c23 -verify -fsyntax-only -fblocks -fnamed-loops %s -// RUN: %clang_cc1 -x c++ -verify -fsyntax-only -fblocks -fnamed-loops %s +// RUN: %clang_cc1 -std=c2y -verify -Wunused -fsyntax-only -fblocks %s +// RUN: %clang_cc1 -std=c23 -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s +// RUN: %clang_cc1 -x c++ -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s void f1() { l1: while (true) { @@ -159,3 +159,15 @@ void f7() { continue d; // expected-error {{'continue' label does not name an enclosing loop}} } } + +void f8() { + l1: // no-warning + while (true) { + break l1; + } + + l2: // no-warning + while (true) { + continue l2; + } +} diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp index bea90ff7eaf8a..1fc6e5ec4cc55 100644 --- a/clang/test/SemaCXX/constant-expression-cxx14.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp @@ -1450,3 +1450,9 @@ namespace GH149500 { unsigned int * p = &(*(unsigned int *)0x400); static const void *q = &(*(const struct sysrq_key_op *)0); } + +constexpr bool missingCase() { + switch (1) { + 1u: return false; // expected-error {{expected 'case' keyword before expression}} + } +} diff --git a/clang/test/SemaCXX/vector.cpp b/clang/test/SemaCXX/vector.cpp index 808bdb679b09c..06195f039cd92 100644 --- a/clang/test/SemaCXX/vector.cpp +++ b/clang/test/SemaCXX/vector.cpp @@ -786,3 +786,16 @@ const long long e = *0; // expected-error {{indirection requires pointer operand double f = a - e; // expected-error {{cannot initialize 
a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}} int h = c - e; // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}} } + +typedef int v_neg_size __attribute__((vector_size(-8))); // expected-error{{vector must have non-negative size}} +typedef int v_neg_size_2 __attribute__((vector_size(-1 * 8))); // expected-error{{vector must have non-negative size}} +typedef int v_ext_neg_size __attribute__((ext_vector_type(-8))); // expected-error{{vector must have non-negative size}} +typedef int v_ext_neg_size2 __attribute__((ext_vector_type(-1 * 8))); // expected-error{{vector must have non-negative size}} + + +#if __cplusplus >= 201103L + +template using templated_v_size = int __attribute__((vector_size(N))); // expected-error{{vector must have non-negative size}} +templated_v_size<-8> templated_v_neg_size; //expected-note{{in instantiation of template type alias 'templated_v_size' requested here}} + +#endif diff --git a/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl new file mode 100644 index 0000000000000..8f2f9308ed966 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl @@ -0,0 +1,134 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify + +float builtin_f16tof32_too_few_arg() { + return __builtin_hlsl_elementwise_f16tof32(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 0 were provided}} +} + +float builtin_f16tof32_too_many_arg(uint p0) { + return __builtin_hlsl_elementwise_f16tof32(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 2 were provided}} +} + +float builtin_f16tof32_bool(bool p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}} +} + +float builtin_f16tof32_bool4(bool4 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool4' (aka 'vector')}} +} + +float builtin_f16tof32_short(short p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}} +} + +float builtin_f16tof32_unsigned_short(unsigned short p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}} +} + +float builtin_f16tof32_int(int p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}} +} + +float builtin_f16tof32_int64_t(long p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}} +} + +float2 builtin_f16tof32_int2_to_float2_promotion(int2 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must 
be a scalar or vector of unsigned integer types (was 'int2' (aka 'vector'))}} +} + +float builtin_f16tof32_half(half p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}} +} + +float builtin_f16tof32_half4(half4 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half4' (aka 'vector'))}} +} + +float builtin_f16tof32_float(float p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}} +} + +float builtin_f16tof32_double(double p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}} +} + +float f16tof32_too_few_arg() { + return f16tof32(); + // expected-error@-1 {{no matching function for call to 'f16tof32'}} +} + +float f16tof32_too_many_arg(uint p0) { + return f16tof32(p0, p0); + // expected-error@-1 {{no matching function for call to 'f16tof32'}} +} + +float f16tof32_bool(bool p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}} +} + +float f16tof32_bool3(bool3 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool3' (aka 'vector'))}} +} + + +float f16tof32_int16_t(short p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}} +} + +float f16tof32_int16_t(unsigned short p0) { + return f16tof32(p0); + // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}} +} + +float f16tof32_int(int p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}} +} + +float f16tof32_int64_t(long p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}} +} + +float2 f16tof32_int2_to_float2_promotion(int3 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int3' (aka 'vector'))}} +} + +float f16tof32_half(half p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}} +} + +float f16tof32_half2(half2 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half2' (aka 'vector'))}} +} + +float f16tof32_float(float p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}} +} + +float f16tof32_double(double p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl index 12c818acec035..b2f45051a9bd8 100644 --- a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl @@ -15,7 +15,7 @@ int2 test_select_vector_vals_not_vecs(bool2 p0, int t0, } int1 test_select_vector_vals_wrong_size(bool2 p0, int1 t0, int1 f0) { - return select(p0, t0, f0); // 
expected-warning{{implicit conversion truncates vector: 'bool2' (aka 'vector') to 'vector' (vector of 1 'bool' value)}} + return select(p0, t0, f0); // No diagnostic expected. } int test_select_no_args() { diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp index 1a575ea527006..60603f0c963a5 100644 --- a/clang/test/SemaTemplate/ctad.cpp +++ b/clang/test/SemaTemplate/ctad.cpp @@ -104,3 +104,15 @@ namespace ConvertDeducedTemplateArgument { auto x = C(D()); } + +namespace pr165560 { +template struct S { + using A = T; + template struct I { // expected-note{{candidate function template not viable: requires 1 argument, but 0 were provided}} \ + // expected-note{{implicit deduction guide declared as 'template I(pr165560::S::I) -> pr165560::S::I'}} + I(typename A::F) {} // expected-error{{type 'A' (aka 'int') cannot be used prior to '::' because it has no members}} + }; +}; +S::I i; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'S::I'}} \ + // expected-note{{while building implicit deduction guide first needed here}} +} diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp index ef229606de0f0..8fc9a66dbda7e 100644 --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp @@ -2076,4 +2076,19 @@ TEST(ExprMutationAnalyzerTest, PointeeMutatedByReturn) { } } +TEST(ExprMutationAnalyzerTest, PointeeMutatedByPointerToMemberOperator) { + // GH161913 + const std::string Code = R"( + struct S { int i; }; + void f(S s) { + S *x = &s; + (x->*(&S::i))++; + } + )"; + auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"}); + auto Results = + match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_TRUE(isPointeeMutated(Results, AST.get())); +} + } // namespace clang diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt index f5bcecb0b08a3..d4efb2230a054 100644 --- a/clang/unittests/CodeGen/CMakeLists.txt +++ b/clang/unittests/CodeGen/CMakeLists.txt @@ -1,6 +1,7 @@ add_clang_unittest(ClangCodeGenTests BufferSourceTest.cpp CodeGenExternalTest.cpp + DemangleTrapReasonInDebugInfo.cpp TBAAMetadataTest.cpp CheckTargetFeaturesTest.cpp CLANG_LIBS diff --git a/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp new file mode 100644 index 0000000000000..17bfe17c31d65 --- /dev/null +++ b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp @@ -0,0 +1,67 @@ +//=== unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/CodeGen/ModuleBuilder.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" + +using namespace clang::CodeGen; + +void CheckValidCommon(llvm::StringRef FuncName, const char *ExpectedCategory, + const char *ExpectedMessage) { + auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName); + ASSERT_TRUE(MaybeTrapReason.has_value()); + auto [Category, Message] = MaybeTrapReason.value(); + ASSERT_STREQ(Category.str().c_str(), ExpectedCategory); + ASSERT_STREQ(Message.str().c_str(), ExpectedMessage); +} + +void CheckInvalidCommon(llvm::StringRef FuncName) { + auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName); + ASSERT_TRUE(!MaybeTrapReason.has_value()); +} + +TEST(DemangleTrapReasonInDebugInfo, Valid) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$trap category$trap message"; + CheckValidCommon(FuncName, "trap category", "trap message"); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyCategory) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$$trap message"; + CheckValidCommon(FuncName, "", "trap message"); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyMessage) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$trap category$"; + CheckValidCommon(FuncName, "trap category", ""); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidAllEmpty) { + // `__builtin_verbose_trap` actually allows this + // currently. However, we should probably disallow this in Sema because having + // an empty category and message completely defeats the point of using the + // builtin (#165981). + std::string FuncName(ClangTrapPrefix); + FuncName += "$$"; + CheckValidCommon(FuncName, "", ""); +} + +TEST(DemangleTrapReasonInDebugInfo, InvalidOnlyPrefix) { + std::string FuncName(ClangTrapPrefix); + CheckInvalidCommon(FuncName); +} + +TEST(DemangleTrapReasonInDebugInfo, Invalid) { + std::string FuncName("foo"); + CheckInvalidCommon(FuncName); +} + +TEST(DemangleTrapReasonInDebugInfo, InvalidEmpty) { CheckInvalidCommon(""); } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index ca9e7925e5e95..24235b966399d 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -20824,6 +20824,13 @@ TEST_F(FormatTest, AlignWithLineBreaks) { " argument1,\n" " argument2);", Style); + + Style.ColumnLimit = 45; + verifyFormat("auto xxxxxxxx = foo;\n" + "auto x = whatever ? some / long -\n" + " computition / stuff\n" + " : random;", + Style); } TEST_F(FormatTest, AlignWithInitializerPeriods) { diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 0312c9dfc0665..e9fadb2dbd4ac 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -40,2843 +40,3314 @@

C++ defect report implementation status

This page tracks which C++ defect reports are implemented within Clang.

[Omitted: the generated rows of the defect report table in this cxx_dr_status.html hunk. The HTML table markup was stripped during extraction, leaving only bare +/- diff markers, so the row contents are not recoverable here.]
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status index 485a9a56267ca..3ba12e13a7354 100755 --- a/clang/www/make_cxx_dr_status +++ b/clang/www/make_cxx_dr_status @@ -10,26 +10,36 @@ output = os.path.join(clang_www_dir, 'cxx_dr_status.html') dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs') class DR: - def __init__(self, section, issue, url, status, title): - self.section, self.issue, self.url, self.status, self.title = \ - section, issue, url, status, title + def __init__(self, *, section_number, section_name, section_link, number, url, status, liaison, title): + self.section_number, self.section_name, self.section_link, self.number, self.url, self.status, self.liaison, self.title = \ + section_number, section_name, section_link, number, url, status, liaison, title def __repr__(self): - return '%s (%s): %s' % (self.issue, self.status, self.title) - -def parse(dr): - try: - section, issue_link, status, liaison, title = [
- col.split('>', 1)[1].split('')[0] - for col in dr.split('', 1)[0].split('', 1)[1].split('<', 1)[0]) - title = title.replace('', '').replace('', '').replace('\r\n', '\n').strip() - return DR(section, issue, url, status, title) + return '%s (%s): %s' % (self.number, self.status, self.title) + + pattern = re.compile(''' +(?P.*) (?P.*) + +(?P.*) +(?P.*) +(?P.*) +(?P[\\w\\W]*)</issue_title></TD> +</TR>''') + + @classmethod + def parse_from_html(cls, html_string): + match = cls.pattern.match(html_string) + if match is None: + print(f"Parse error: {html_string}", file=sys.stderr) + exit(1) + return cls( + section_number=match.group('section_number'), + section_name=match.group('section_name'), + section_link=match.group('section_link'), + number=int(match.group('number')), + url=match.group('url'), + status=match.group('status'), + liaison=match.group('liaison'), + title=match.group('title').replace('\n', ' ').strip()) def collect_tests(): status_re = re.compile(r'\bcwg([0-9]+): (.*)') @@ -68,8 +78,8 @@ def get_issues(path): print(ex, file=sys.stderr) sys.exit(1) - return sorted((parse(dr) for dr in buffer.split('<TR>')[2:]), - key = lambda dr: dr.issue) + return sorted((DR.parse_from_html(dr) for dr in buffer.split('<TR>')[2:]), + key = lambda dr: dr.number) issue_list_path = None @@ -127,9 +137,10 @@ out_html.append('''\ <p>This page tracks which C++ defect reports are implemented within Clang.</p> -<table width="689" border="1" cellspacing="0"> +<table width="892" border="1" cellspacing="0"> <tr> <th>Number</th> + <th>Section</th> <th>Status</th> <th>Issue title</th> <th>Available in Clang?</th> @@ -149,7 +160,7 @@ def availability(issue): unresolved_status = unresolved_status_match.group(1) proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status) if proposed_resolution_match is None: - raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status)) + raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.number, unresolved_status)) proposed_resolution = proposed_resolution_match.group(2) status = status[:-1-len(proposed_resolution)] status = status[:-1-len(unresolved_status)] @@ -236,7 +247,7 @@ def availability(issue): avail = 'Duplicate of <a href="#%s">%s</a>' % (dup, dup) _, avail_style, _, _ = availability(dup) else: - raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.issue)) + raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.number)) return (avail + avail_suffix, avail_style, unresolved_status, details) count = {} @@ -254,7 +265,7 @@ for dr in drs: elif dr.status in ('open', 'drafting', 'review', 'tentatively ready', 'ready'): row_style = ' class="open"' try: - avail, avail_style, unresolved_status, details = availability(dr.issue) + avail, avail_style, unresolved_status, details = availability(dr.number) except AvailabilityError as e: availability_error_occurred = True print(e.args[0]) @@ -267,12 +278,12 @@ for dr in drs: if unresolved_status != dr.status: availability_error_occurred = True print("error: issue %s is marked '%s', which differs from CWG index status '%s'" \ - % (dr.issue, unresolved_status, dr.status)) + % (dr.number, unresolved_status, dr.status)) continue else: row_style = '' try: - avail, avail_style, 
unresolved_status, details = availability(dr.issue) + avail, avail_style, unresolved_status, details = availability(dr.number) except AvailabilityError as e: availability_error_occurred = True print(e.args[0]) @@ -281,7 +292,7 @@ for dr in drs: if unresolved_status: availability_error_occurred = True print("error: issue %s is marked '%s', even though it is resolved in CWG index" \ - % (dr.issue, unresolved_status)) + % (dr.number, unresolved_status)) continue if not avail.startswith('Sup') and not avail.startswith('Dup'): @@ -297,8 +308,9 @@ for dr in drs: {details} </details>''' out_html.append(f''' - <tr{row_style} id="{dr.issue}"> - <td><a href="https://cplusplus.github.io/CWG/issues/{dr.issue}.html">{dr.issue}</a></td> + <tr{row_style} id="{dr.number}"> + <td><a href="https://cplusplus.github.io/CWG/issues/{dr.number}.html">{dr.number}</a></td> + <td>[<a href="{dr.section_link}">{dr.section_name}</a>]</td> <td>{dr.status}</td> <td>{dr.title}</td> <td{avail_style} align="center">{avail}</td> diff --git a/compiler-rt/lib/asan/scripts/asan_symbolize.py b/compiler-rt/lib/asan/scripts/asan_symbolize.py index 8ecd66c745119..091e9bcc9a796 100755 --- a/compiler-rt/lib/asan/scripts/asan_symbolize.py +++ b/compiler-rt/lib/asan/scripts/asan_symbolize.py @@ -59,6 +59,7 @@ def is_valid_arch(s): "armv7s", "armv7k", "arm64", + "arm64e", "powerpc64", "powerpc64le", "s390x", diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index c21b2bad1d212..45b7055abf454 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -21,7 +21,9 @@ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +#if __STDC_HOSTED__ #include <assert.h> +#endif // __STDC_HOSTED__ #if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) #include <cpuid.h> @@ -245,8 +247,8 @@ struct __processor_model { unsigned int __cpu_features[1]; } __cpu_model = {0, 0, 0, {0}}; -static_assert(sizeof(__cpu_model) == 16, - "Wrong size of __cpu_model will result in ABI break"); +_Static_assert(sizeof(__cpu_model) == 16, + "Wrong size of __cpu_model will result in ABI break"); // This code is copied from lib/Support/Host.cpp. // Changes to either file should be mirrored in the other. @@ -1200,8 +1202,8 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { unsigned Vendor; unsigned Model, Family; unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; - static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); - static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); + _Static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); + _Static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); // This function needs to run just once. 
if (__cpu_model.__cpu_vendor) @@ -1234,9 +1236,11 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { } else __cpu_model.__cpu_vendor = VENDOR_OTHER; +#if __STDC_HOSTED__ assert(__cpu_model.__cpu_vendor < VENDOR_MAX); assert(__cpu_model.__cpu_type < CPU_TYPE_MAX); assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX); +#endif // __STDC_HOSTED__ return 0; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index ba85a0eb5a35e..b515b15b327d8 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -737,6 +737,7 @@ enum ModuleArch { kModuleArchARMV7S, kModuleArchARMV7K, kModuleArchARM64, + kModuleArchARM64E, kModuleArchLoongArch64, kModuleArchRISCV64, kModuleArchHexagon @@ -810,6 +811,8 @@ inline const char *ModuleArchToString(ModuleArch arch) { return "armv7k"; case kModuleArchARM64: return "arm64"; + case kModuleArchARM64E: + return "arm64e"; case kModuleArchLoongArch64: return "loongarch64"; case kModuleArchRISCV64: diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index b0a29db908639..90c0b66f81b5b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -960,7 +960,17 @@ static void DisableMmapExcGuardExceptions() { RTLD_DEFAULT, "task_set_exc_guard_behavior"); if (set_behavior == nullptr) return; const task_exc_guard_behavior_t task_exc_guard_none = 0; - set_behavior(mach_task_self(), task_exc_guard_none); + kern_return_t res = set_behavior(mach_task_self(), task_exc_guard_none); + if (res != KERN_SUCCESS) { + Report( + "WARN: task_set_exc_guard_behavior returned %d (%s), " + "mmap may fail unexpectedly.\n", + res, mach_error_string(res)); + if (res == KERN_DENIED) + Report( + "HINT: Check that task_set_exc_guard_behavior is allowed by " + "sandbox.\n"); + } } static void VerifyInterceptorsWorking(); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp index a9533d6fc04ca..a5ec85ae16460 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp @@ -20,18 +20,21 @@ #include <mach/mach.h> // These are not available in older macOS SDKs. 
-#ifndef CPU_SUBTYPE_X86_64_H -#define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */ -#endif -#ifndef CPU_SUBTYPE_ARM_V7S -#define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */ -#endif -#ifndef CPU_SUBTYPE_ARM_V7K -#define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12) -#endif -#ifndef CPU_TYPE_ARM64 -#define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) -#endif +# ifndef CPU_SUBTYPE_X86_64_H +# define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */ +# endif +# ifndef CPU_SUBTYPE_ARM_V7S +# define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */ +# endif +# ifndef CPU_SUBTYPE_ARM_V7K +# define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12) +# endif +# ifndef CPU_TYPE_ARM64 +# define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) +# endif +# ifndef CPU_SUBTYPE_ARM64E +# define CPU_SUBTYPE_ARM64E ((cpu_subtype_t)2) +# endif namespace __sanitizer { @@ -311,18 +314,26 @@ ModuleArch ModuleArchFromCpuType(cpu_type_t cputype, cpu_subtype_t cpusubtype) { case CPU_TYPE_I386: return kModuleArchI386; case CPU_TYPE_X86_64: - if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) return kModuleArchX86_64; - if (cpusubtype == CPU_SUBTYPE_X86_64_H) return kModuleArchX86_64H; + if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) + return kModuleArchX86_64; + if (cpusubtype == CPU_SUBTYPE_X86_64_H) + return kModuleArchX86_64H; CHECK(0 && "Invalid subtype of x86_64"); return kModuleArchUnknown; case CPU_TYPE_ARM: - if (cpusubtype == CPU_SUBTYPE_ARM_V6) return kModuleArchARMV6; - if (cpusubtype == CPU_SUBTYPE_ARM_V7) return kModuleArchARMV7; - if (cpusubtype == CPU_SUBTYPE_ARM_V7S) return kModuleArchARMV7S; - if (cpusubtype == CPU_SUBTYPE_ARM_V7K) return kModuleArchARMV7K; + if (cpusubtype == CPU_SUBTYPE_ARM_V6) + return kModuleArchARMV6; + if (cpusubtype == CPU_SUBTYPE_ARM_V7) + return kModuleArchARMV7; + if (cpusubtype == CPU_SUBTYPE_ARM_V7S) + return kModuleArchARMV7S; + if (cpusubtype == CPU_SUBTYPE_ARM_V7K) + return kModuleArchARMV7K; CHECK(0 && "Invalid subtype of ARM"); return kModuleArchUnknown; case CPU_TYPE_ARM64: + if (cpusubtype == CPU_SUBTYPE_ARM64E) + return kModuleArchARM64E; return kModuleArchARM64; default: CHECK(0 && "Invalid CPU type"); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index f8d821e125b7a..7eb0c9756d64a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -505,6 +505,13 @@ static void ChooseSymbolizerTools(IntrusiveList<SymbolizerTool> *list, } # if SANITIZER_APPLE + if (list->empty()) { + Report( + "WARN: No external symbolizers found. Symbols may be missing or " + "unreliable.\n"); + Report( + "HINT: Is PATH set? 
Does sandbox allow file-read of /usr/bin/atos?\n"); + } VReport(2, "Using dladdr symbolizer.\n"); list->push_back(new (*allocator) DlAddrSymbolizer()); # endif // SANITIZER_APPLE diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp index 00542b944f516..c18e5bd9f3194 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp @@ -70,7 +70,7 @@ TEST(MemoryMapping, LoadedModuleArchAndUUID) { EXPECT_EQ(arch, kModuleArchI386); } else if (SANITIZER_WORDSIZE == 64) { EXPECT_TRUE(arch == kModuleArchX86_64 || arch == kModuleArchX86_64H || - arch == kModuleArchARM64); + arch == kModuleArchARM64 || arch == kModuleArchARM64E); } const u8 *uuid = modules[i].uuid(); u8 null_uuid[kModuleUUIDSize] = {0}; diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def index 748530820cd64..0aea7b8f2fb9a 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.def +++ b/compiler-rt/lib/scudo/standalone/allocator_config.def @@ -57,6 +57,10 @@ BASE_OPTIONAL(const bool, MaySupportMemoryTagging, false) // Disable the quarantine code. BASE_OPTIONAL(const bool, QuarantineDisabled, false) +// If set to true, malloc_usable_size returns the exact size of the allocation. +// If set to false, return the total available size in the allocation. +BASE_OPTIONAL(const bool, ExactUsableSize, true) + // PRIMARY_REQUIRED_TYPE(NAME) // // SizeClassMap to use with the Primary. diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 329ec4596482b..ffe9554203241 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -706,19 +706,26 @@ class Allocator { if (!getChunkFromBlock(Block, &Chunk, &Header) && !getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) return; - } else { - if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) - return; + } else if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) { + return; } - if (Header.State == Chunk::State::Allocated) { - uptr TaggedChunk = Chunk; - if (allocatorSupportsMemoryTagging<AllocatorConfig>()) - TaggedChunk = untagPointer(TaggedChunk); - if (useMemoryTagging<AllocatorConfig>(Primary.Options.load())) - TaggedChunk = loadTag(Chunk); - Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header), - Arg); + + if (Header.State != Chunk::State::Allocated) + return; + + uptr TaggedChunk = Chunk; + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + TaggedChunk = untagPointer(TaggedChunk); + uptr Size; + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) { + TaggedChunk = loadTag(Chunk); + Size = getSize(reinterpret_cast<void *>(Chunk), &Header); + } else if (AllocatorConfig::getExactUsableSize()) { + Size = getSize(reinterpret_cast<void *>(Chunk), &Header); + } else { + Size = getUsableSize(reinterpret_cast<void *>(Chunk), &Header); } + Callback(TaggedChunk, Size, Arg); }; Primary.iterateOverBlocks(Lambda); Secondary.iterateOverBlocks(Lambda); @@ -759,16 +766,50 @@ class Allocator { return false; } - // Return the usable size for a given chunk. Technically we lie, as we just - // report the actual size of a chunk. 
This is done to counteract code actively - // writing past the end of a chunk (like sqlite3) when the usable size allows - // for it, which then forces realloc to copy the usable size of a chunk as - // opposed to its actual size. + ALWAYS_INLINE uptr getUsableSize(const void *Ptr, + Chunk::UnpackedHeader *Header) { + void *BlockBegin = getBlockBegin(Ptr, Header); + if (LIKELY(Header->ClassId)) { + return SizeClassMap::getSizeByClassId(Header->ClassId) - + (reinterpret_cast<uptr>(Ptr) - reinterpret_cast<uptr>(BlockBegin)); + } + + uptr UntaggedPtr = reinterpret_cast<uptr>(Ptr); + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) { + UntaggedPtr = untagPointer(UntaggedPtr); + BlockBegin = untagPointer(BlockBegin); + } + return SecondaryT::getBlockEnd(BlockBegin) - UntaggedPtr; + } + + // Return the usable size for a given chunk. If MTE is enabled or if the + // ExactUsableSize config parameter is true, we report the exact size of + // the original allocation size. Otherwise, we will return the total + // actual usable size. uptr getUsableSize(const void *Ptr) { if (UNLIKELY(!Ptr)) return 0; - return getAllocSize(Ptr); + if (AllocatorConfig::getExactUsableSize() || + UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) + return getAllocSize(Ptr); + + initThreadMaybe(); + +#ifdef GWP_ASAN_HOOKS + if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) + return GuardedAlloc.getSize(Ptr); +#endif // GWP_ASAN_HOOKS + + Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr)); + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, Ptr, &Header); + + // Getting the alloc size of a chunk only makes sense if it's allocated. + if (UNLIKELY(Header.State != Chunk::State::Allocated)) + reportInvalidChunkState(AllocatorAction::Sizing, Ptr); + + return getUsableSize(Ptr, &Header); } uptr getAllocSize(const void *Ptr) { @@ -951,6 +992,19 @@ class Allocator { MemorySize, 2, 16); } + uptr getBlockBeginTestOnly(const void *Ptr) { + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, Ptr, &Header); + DCHECK(Header.State == Chunk::State::Allocated); + + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + Ptr = untagPointer(const_cast<void *>(Ptr)); + void *Begin = getBlockBegin(Ptr, &Header); + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + Begin = untagPointer(Begin); + return reinterpret_cast<uptr>(Begin); + } + private: typedef typename PrimaryT::SizeClassMap SizeClassMap; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 5fdfd1e7c55cc..4837ac96b9b26 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -1152,6 +1152,248 @@ TEST(ScudoCombinedTest, QuarantineDisabled) { EXPECT_EQ(Stats.find("Stats: Quarantine"), std::string::npos); } +struct UsableSizeClassConfig { + static const scudo::uptr NumBits = 1; + static const scudo::uptr MinSizeLog = 10; + static const scudo::uptr MidSizeLog = 10; + static const scudo::uptr MaxSizeLog = 13; + static const scudo::u16 MaxNumCachedHint = 8; + static const scudo::uptr MaxBytesCachedLog = 12; + static const scudo::uptr SizeDelta = 0; +}; + +struct TestExactUsableSizeConfig { + static const bool MaySupportMemoryTagging = false; + static const bool QuarantineDisabled = true; + + template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>; + + struct Primary { + // In order to properly test the usable size, this Primary config has + // four real size classes: 
1024, 2048, 4096, 8192. + using SizeClassMap = scudo::FixedSizeClassMap<UsableSizeClassConfig>; + static const scudo::uptr RegionSizeLog = 21U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 18; + }; + template <typename Config> + using PrimaryT = scudo::SizeClassAllocator64<Config>; + + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; + + template <typename Config> using SecondaryT = scudo::MapAllocator<Config>; +}; + +template <class AllocatorT> void VerifyExactUsableSize(AllocatorT &Allocator) { + // Scan through all sizes up to 10000 then some larger sizes. + for (scudo::uptr Size = 1; Size < 10000; Size++) { + void *P = Allocator.allocate(Size, Origin); + EXPECT_EQ(Size, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << Size; + Allocator.deallocate(P, Origin); + } + + // Verify that aligned allocations also return the exact size allocated. + const scudo::uptr AllocSize = 313; + for (scudo::uptr Align = 1; Align <= 8; Align++) { + void *P = Allocator.allocate(AllocSize, Origin, 1U << Align); + EXPECT_EQ(AllocSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << AllocSize << " at align " + << (1U << Align); + Allocator.deallocate(P, Origin); + } + + // Verify an explicitly large allocation. + const scudo::uptr LargeAllocSize = 1000000; + void *P = Allocator.allocate(LargeAllocSize, Origin); + EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P)); + Allocator.deallocate(P, Origin); + + // Now do the same for aligned large allocations. + for (scudo::uptr Align = 1; Align <= 8; Align++) { + void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Align); + EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << LargeAllocSize << " at align " + << (1U << Align); + Allocator.deallocate(P, Origin); + } +} + +template <class AllocatorT> +void VerifyIterateOverUsableSize(AllocatorT &Allocator) { + // This will not verify whether the size is the exact size or the size of the + // size class. Instead, verify that the size matches the usable size and + // assume the other tests have verified getUsableSize.
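+  // As a rough illustration (not asserted here): with ExactUsableSize, the
+  // 128-byte allocation below reports a usable size of 128, while with
+  // ExactUsableSize=false it reports what remains of its 1024-byte size
+  // class after the chunk header.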
+ std::unordered_map<void *, size_t> Pointers; + Pointers.insert({Allocator.allocate(128, Origin), 0U}); + Pointers.insert({Allocator.allocate(128, Origin, 32), 0U}); + Pointers.insert({Allocator.allocate(2000, Origin), 0U}); + Pointers.insert({Allocator.allocate(2000, Origin, 64), 0U}); + Pointers.insert({Allocator.allocate(8000, Origin), 0U}); + Pointers.insert({Allocator.allocate(8000, Origin, 128), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin, 128), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin, 256), 0U}); + + Allocator.disable(); + Allocator.iterateOverChunks( + 0, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1), + [](uintptr_t Base, size_t Size, void *Arg) { + std::unordered_map<void *, size_t> *Pointers = + reinterpret_cast<std::unordered_map<void *, size_t> *>(Arg); + (*Pointers)[reinterpret_cast<void *>(Base)] = Size; + }, + reinterpret_cast<void *>(&Pointers)); + Allocator.enable(); + + for (auto [Ptr, IterateSize] : Pointers) { + EXPECT_NE(0U, IterateSize) + << "Pointer " << Ptr << " not found in iterateOverChunks call."; + EXPECT_EQ(IterateSize, Allocator.getUsableSize(Ptr)) + << "Pointer " << Ptr + << " mismatch between iterate size and usable size."; + Allocator.deallocate(Ptr, Origin); + } +} + +TEST(ScudoCombinedTest, ExactUsableSize) { + using AllocatorT = scudo::Allocator<TestExactUsableSizeConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +struct TestExactUsableSizeMTEConfig : TestExactUsableSizeConfig { + static const bool MaySupportMemoryTagging = true; +}; + +TEST(ScudoCombinedTest, ExactUsableSizeMTE) { + if (!scudo::archSupportsMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + TEST_SKIP("Only supported on systems that can enable MTE."); + + scudo::enableSystemMemoryTaggingTestOnly(); + + using AllocatorT = scudo::Allocator<TestExactUsableSizeMTEConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +template <class AllocatorT> +void VerifyUsableSizePrimary(AllocatorT &Allocator) { + std::vector<scudo::uptr> SizeClasses = {1024U, 2048U, 4096U, 8192U}; + for (size_t I = 0; I < SizeClasses.size(); I++) { + scudo::uptr SizeClass = SizeClasses[I]; + scudo::uptr StartSize; + if (I == 0) + StartSize = 1; + else + StartSize = SizeClasses[I - 1]; + scudo::uptr UsableSize = SizeClass - scudo::Chunk::getHeaderSize(); + for (scudo::uptr Size = StartSize; Size < UsableSize; Size++) { + void *P = Allocator.allocate(Size, Origin); + EXPECT_EQ(UsableSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << Size + << " for size class " << SizeClass; + memset(P, 0xff, UsableSize); + EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass, + reinterpret_cast<scudo::uptr>(P) + UsableSize); + Allocator.deallocate(P, Origin); + } + + StartSize = UsableSize + 1; + } + + std::vector<scudo::uptr> Alignments = {32U, 128U}; + for (size_t I = 0; I < SizeClasses.size(); I++) { + scudo::uptr SizeClass = SizeClasses[I]; + scudo::uptr AllocSize; + if (I == 0) + AllocSize = 1; + else + AllocSize = SizeClasses[I - 1] + 1; + + for (auto Alignment : Alignments) { + void *P = Allocator.allocate(AllocSize, Origin, Alignment); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + memset(P, 0xff, 
UsableSize); + EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass, + reinterpret_cast<scudo::uptr>(P) + UsableSize) + << "Failed usable size at allocation size " << AllocSize + << " for size class " << SizeClass << " at alignment " << Alignment; + Allocator.deallocate(P, Origin); + } + } +} + +template <class AllocatorT> +void VerifyUsableSizeSecondary(AllocatorT &Allocator) { + const scudo::uptr LargeAllocSize = 996780; + const scudo::uptr PageSize = scudo::getPageSizeCached(); + void *P = Allocator.allocate(LargeAllocSize, Origin); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + memset(P, 0xff, UsableSize); + // Assumes that the secondary always rounds up allocations to a page boundary. + EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize, + PageSize), + reinterpret_cast<scudo::uptr>(P) + UsableSize); + Allocator.deallocate(P, Origin); + + // Check aligned allocations now. + for (scudo::uptr Alignment = 1; Alignment <= 8; Alignment++) { + void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Alignment); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize, + PageSize), + reinterpret_cast<scudo::uptr>(P) + UsableSize) + << "Failed usable size at allocation size " << LargeAllocSize + << " at alignment " << Alignment; + Allocator.deallocate(P, Origin); + } +} + +struct TestFullUsableSizeConfig : TestExactUsableSizeConfig { + static const bool ExactUsableSize = false; +}; + +TEST(ScudoCombinedTest, FullUsableSize) { + using AllocatorT = scudo::Allocator<TestFullUsableSizeConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyUsableSizePrimary<AllocatorT>(*Allocator); + VerifyUsableSizeSecondary<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +struct TestFullUsableSizeMTEConfig : TestFullUsableSizeConfig { + static const bool MaySupportMemoryTagging = true; +}; + +TEST(ScudoCombinedTest, FullUsableSizeMTE) { + if (!scudo::archSupportsMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + TEST_SKIP("Only supported on systems that can enable MTE."); + + scudo::enableSystemMemoryTaggingTestOnly(); + + using AllocatorT = scudo::Allocator<TestFullUsableSizeMTEConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + // When MTE is enabled, you get exact sizes. + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} // Verify that no special quarantine blocks appear in iterateOverChunks. 
TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { using AllocatorT = TestAllocator<TestQuarantineConfig>; diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 612317b3c3293..9e5d0658e5ed5 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -588,8 +588,13 @@ TEST_F(ScudoWrappersCTest, MallocInfo) { EXPECT_EQ(errno, 0); fclose(F); EXPECT_EQ(strncmp(Buffer, "<malloc version=\"scudo-", 23), 0); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"1234\" count=\"")); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"4321\" count=\"")); + std::string expected; + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P1)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P2)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); free(P1); free(P2); diff --git a/compiler-rt/test/asan/TestCases/strcmp.c b/compiler-rt/test/asan/TestCases/strcmp.c index 417bd491ebe02..2b31e64768c42 100644 --- a/compiler-rt/test/asan/TestCases/strcmp.c +++ b/compiler-rt/test/asan/TestCases/strcmp.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { assert(strcmp(s1 - 1, s2)); // CHECK: {{.*ERROR: AddressSanitizer: stack-buffer-underflow on address}} - // CHECK: READ of size 1 + // Very rarely `s1[-1]` happens to be '1', resulting in `strcmp` needing to + // check 2 bytes before failing, rather than 1 - this should still pass + // CHECK: READ of size {{[12]}} return 0; } diff --git a/compiler-rt/test/tsan/cxa_guard_acquire.cpp b/compiler-rt/test/tsan/cxa_guard_acquire.cpp index fc407259e8968..6050c243cb8c1 100644 --- a/compiler-rt/test/tsan/cxa_guard_acquire.cpp +++ b/compiler-rt/test/tsan/cxa_guard_acquire.cpp @@ -66,10 +66,17 @@ int main(int argc, char **argv) { printf("Enter main\n"); // If initialization is contended, the blocked thread should enter a - // potentially blocking region. + // potentially blocking region. Note that we use a DAG check because it is + // possible for Thread 1 to acquire the guard, then Thread 2 fail to acquire + // the guard then call `OnPotentiallyBlockingRegionBegin` and print "Enter + // potentially blocking region\n", before Thread 1 manages to reach "Enter + // constructor\n". This is exceptionally rare, but can be replicated by + // inserting a `sleep(1)` between `LazyInit() {` and `printf("Enter + // constructor\n");`. Due to the barrier it is not possible for the exit logs + // to be inverted. 
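+  // In other words, either ordering of the two "Enter" lines below is
+  // acceptable, while the two "Exit" lines must still appear in the exact
+  // order given.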
// - // CHECK-NEXT: Enter constructor - // CHECK-NEXT: Enter potentially blocking region + // CHECK-DAG: Enter constructor + // CHECK-DAG: Enter potentially blocking region // CHECK-NEXT: Exit constructor // CHECK-NEXT: Exit potentially blocking region barrier_init(&barrier, 2); diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 225a6558ef956..ef58da61e371b 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -445,6 +445,7 @@ struct NodeVisitor { READ_FEATURE(ObjectDecl) READ_FEATURE(OldParameterStmt) READ_FEATURE(OmpAlignedClause) + READ_FEATURE(OmpAllocateDirective) READ_FEATURE(OmpBeginDirective) READ_FEATURE(OmpBeginLoopDirective) READ_FEATURE(OmpBeginSectionsDirective) @@ -541,7 +542,6 @@ struct NodeVisitor { READ_FEATURE(OpenMPCancellationPointConstruct) READ_FEATURE(OpenMPConstruct) READ_FEATURE(OpenMPCriticalConstruct) - READ_FEATURE(OpenMPDeclarativeAllocate) READ_FEATURE(OpenMPDeclarativeConstruct) READ_FEATURE(OpenMPDeclareReductionConstruct) READ_FEATURE(OpenMPDeclareSimdConstruct) @@ -550,7 +550,6 @@ struct NodeVisitor { READ_FEATURE(OmpAtomicDefaultMemOrderClause) READ_FEATURE(OpenMPFlushConstruct) READ_FEATURE(OpenMPLoopConstruct) - READ_FEATURE(OpenMPExecutableAllocate) READ_FEATURE(OpenMPAllocatorsConstruct) READ_FEATURE(OpenMPRequiresConstruct) READ_FEATURE(OpenMPSimpleStandaloneConstruct) diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h new file mode 100644 index 0000000000000..d735ce95a83dc --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -0,0 +1,95 @@ +//==-- Builder/CUDAIntrinsicCall.h - lowering of CUDA intrinsics ---*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_CUDAINTRINSICCALL_H +#define FORTRAN_LOWER_CUDAINTRINSICCALL_H + +#include "flang/Optimizer/Builder/IntrinsicCall.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" + +namespace fir { + +struct CUDAIntrinsicLibrary : IntrinsicLibrary { + + // Constructors. + explicit CUDAIntrinsicLibrary(fir::FirOpBuilder &builder, mlir::Location loc) + : IntrinsicLibrary(builder, loc) {} + CUDAIntrinsicLibrary() = delete; + CUDAIntrinsicLibrary(const CUDAIntrinsicLibrary &) = delete; + + // CUDA intrinsic handlers. 
+ mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicAddR2(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + template <int extent> + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicCas(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicExch(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicXor(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); + template <const char *fctName, int extent> + fir::ExtendedValue genLDXXFunc(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); + template <typename OpTy> + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); + void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); + void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); + void 
genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); + template <mlir::NVVM::VoteSyncKind kind> + mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); +}; + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name); + +} // namespace fir + +#endif // FORTRAN_LOWER_CUDAINTRINSICCALL_H diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 3407dd01dd504..01d27fd5fc399 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -19,7 +19,6 @@ #include "flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include <optional> @@ -187,20 +186,6 @@ struct IntrinsicLibrary { mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicCas(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicExch(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicXor(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>); @@ -208,11 +193,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genAssociated(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtand(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genBesselJn(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genBesselYn(mlir::Type, @@ -234,9 +214,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genCpuTime(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCshift(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <const char *fctName, int extent> - fir::ExtendedValue genCUDALDXXFunc(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCFunPtr(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCPtr(mlir::Type, @@ -276,7 +253,6 @@ struct IntrinsicLibrary { llvm::ArrayRef<fir::ExtendedValue>); template <Extremum, ExtremumBehavior> mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>); - void 
genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef<mlir::Value> args); @@ -368,8 +344,6 @@ struct IntrinsicLibrary { mlir::Value genMalloc(mlir::Type, llvm::ArrayRef<mlir::Value>); template <typename Shift> mlir::Value genMask(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genMatmulTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); @@ -392,8 +366,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genNumImages(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <typename OpTy> - mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genPerror(llvm::ArrayRef<fir::ExtendedValue>); @@ -448,56 +420,25 @@ struct IntrinsicLibrary { fir::ExtendedValue genSum(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genSignalSubroutine(llvm::ArrayRef<fir::ExtendedValue>); void genSleep(llvm::ArrayRef<fir::ExtendedValue>); - void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genSystem(std::optional<mlir::Type>, mlir::ArrayRef<fir::ExtendedValue> args); void genSystemClock(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTand(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue 
genThisImage(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUnlink(std::optional<mlir::Type> resultType, llvm::ArrayRef<fir::ExtendedValue> args); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <mlir::NVVM::VoteSyncKind kind> - mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); /// Implement all conversion functions like DBLE, the first argument is /// the value to convert. There may be an additional KIND arguments that diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index a7398a4ef970f..de2716410d6cd 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -512,6 +512,7 @@ class ParseTreeDumper { NODE(parser, OmpAlignModifier) NODE(parser, OmpAllocateClause) NODE(OmpAllocateClause, Modifier) + NODE(parser, OmpAllocateDirective) NODE(parser, OmpAllocatorComplexModifier) NODE(parser, OmpAllocatorSimpleModifier) NODE(parser, OmpAlwaysModifier) @@ -739,7 +740,6 @@ class ParseTreeDumper { NODE(parser, OpenMPCancellationPointConstruct) NODE(parser, OpenMPConstruct) NODE(parser, OpenMPCriticalConstruct) - NODE(parser, OpenMPDeclarativeAllocate) NODE(parser, OpenMPDeclarativeAssumes) NODE(parser, OpenMPDeclarativeConstruct) NODE(parser, OpenMPDeclareMapperConstruct) @@ -748,7 +748,6 @@ class ParseTreeDumper { NODE(parser, OpenMPDeclareTargetConstruct) NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPDispatchConstruct) - NODE(parser, OpenMPExecutableAllocate) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPGroupprivate) NODE(parser, OpenMPLoopConstruct) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index 49db091af93a7..8fa4a84aff06d 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -22,6 +22,7 @@ #include <type_traits> #include <utility> #include <variant> +#include <vector> namespace Fortran::parser::omp { @@ -33,23 +34,6 @@ template <typename T> constexpr auto addr_if(const std::optional<T> &x) { } namespace detail { -using D = llvm::omp::Directive; - -template <typename Construct> // -struct ConstructId { - static constexpr llvm::omp::Directive id{D::OMPD_unknown}; -}; - -#define MAKE_CONSTR_ID(Construct, Id) \ - template <> struct ConstructId<Construct> { \ - static constexpr llvm::omp::Directive id{Id}; \ - } - -MAKE_CONSTR_ID(OpenMPDeclarativeAllocate, D::OMPD_allocate); -MAKE_CONSTR_ID(OpenMPExecutableAllocate, D::OMPD_allocate); - -#undef MAKE_CONSTR_ID - struct DirectiveNameScope { static OmpDirectiveName MakeName(CharBlock source = {}, llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown) { @@ -97,9 +81,6 @@ struct DirectiveNameScope { } else if constexpr (TupleTrait<T>) { if constexpr (std::is_base_of_v<OmpBlockConstruct, T>) { return std::get<OmpBeginDirective>(x.t).DirName(); - } else if constexpr (std::is_same_v<T, 
OpenMPDeclarativeAllocate> || - std::is_same_v<T, OpenMPExecutableAllocate>) { - return MakeName(std::get<Verbatim>(x.t).source, ConstructId<T>::id); } else { return GetFromTuple( x.t, std::make_index_sequence<std::tuple_size_v<decltype(x.t)>>{}); @@ -139,6 +120,9 @@ template <typename T> OmpDirectiveName GetOmpDirectiveName(const T &x) { return detail::DirectiveNameScope::GetOmpDirectiveName(x); } +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x); +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x); + const OmpObjectList *GetOmpObjectList(const OmpClause &clause); template <typename T> @@ -158,6 +142,13 @@ const OmpCombinerExpression *GetCombinerExpr( const OmpReductionSpecifier &rspec); const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init); +struct OmpAllocateInfo { + std::vector<const OmpAllocateDirective *> dirs; + const ExecutionPartConstruct *body{nullptr}; +}; + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x); + } // namespace Fortran::parser::omp #endif // FORTRAN_PARSER_OPENMP_UTILS_H diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 4dd5e84f60dfe..8c7578f7a1941 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -5151,17 +5151,42 @@ struct OpenMPThreadprivate { CharBlock source; }; -// 2.11.3 allocate -> ALLOCATE (variable-name-list) [clause] -struct OpenMPDeclarativeAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPDeclarativeAllocate); - CharBlock source; - std::tuple<Verbatim, std::optional<OmpObjectList>, OmpClauseList> t; +// Ref: [4.5:310-312], [5.0:156-158], [5.1:181-184], [5.2:176-177], +// [6.0:310-312] +// +// allocate-directive -> +// ALLOCATE (variable-list-item...) | // since 4.5 +// ALLOCATE (variable-list-item...) // since 5.0, until 5.1 +// ... +// allocate-stmt +// +// The first form is the "declarative-allocate", and is a declarative +// directive. The second is the "executable-allocate" and is an executable +// directive. The executable form was deprecated in 5.2. +// +// The executable-allocate consists of several ALLOCATE directives. Since +// in the parse tree every type corresponding to a directive only corresponds +// to a single directive, the executable form is represented by a sequence +// of nested OmpAllocateDirectives, e.g. +// !$OMP ALLOCATE(x) +// !$OMP ALLOCATE(y) +// ALLOCATE(x, y) +// will become +// OmpAllocateDirective +// |- ALLOCATE(x) // begin directive +// `- OmpAllocateDirective // block +// |- ALLOCATE(y) // begin directive +// `- ALLOCATE(x, y) // block +// +// The block in the declarative-allocate will be empty.
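+// For example (following the same notation), the purely declarative form
+//   !$OMP ALLOCATE(x)
+// on its own is represented as a single node:
+//   OmpAllocateDirective
+//   |- ALLOCATE(x)   // begin directive
+//   `- <empty block>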
+struct OmpAllocateDirective : public OmpBlockConstruct { + INHERITED_TUPLE_CLASS_BOILERPLATE(OmpAllocateDirective, OmpBlockConstruct); }; struct OpenMPDeclarativeConstruct { UNION_CLASS_BOILERPLATE(OpenMPDeclarativeConstruct); CharBlock source; - std::variant<OpenMPDeclarativeAllocate, OpenMPDeclarativeAssumes, + std::variant<OmpAllocateDirective, OpenMPDeclarativeAssumes, OpenMPDeclareMapperConstruct, OpenMPDeclareReductionConstruct, OpenMPDeclareSimdConstruct, OpenMPDeclareTargetConstruct, OmpDeclareVariantDirective, OpenMPGroupprivate, OpenMPThreadprivate, @@ -5174,19 +5199,6 @@ struct OpenMPCriticalConstruct : public OmpBlockConstruct { INHERITED_TUPLE_CLASS_BOILERPLATE(OpenMPCriticalConstruct, OmpBlockConstruct); }; -// 2.11.3 allocate -> ALLOCATE [(variable-name-list)] [clause] -// [ALLOCATE (variable-name-list) [clause] [...]] -// allocate-statement -// clause -> allocator-clause -struct OpenMPExecutableAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPExecutableAllocate); - CharBlock source; - std::tuple<Verbatim, std::optional<OmpObjectList>, OmpClauseList, - std::optional<std::list<OpenMPDeclarativeAllocate>>, - Statement<AllocateStmt>> - t; -}; - // Ref: [5.2:180-181], [6.0:315] // // allocators-construct -> @@ -5342,9 +5354,9 @@ struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant<OpenMPStandaloneConstruct, OpenMPSectionsConstruct, OpenMPSectionConstruct, OpenMPLoopConstruct, OmpBlockConstruct, - OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPDispatchConstruct, - OpenMPUtilityConstruct, OpenMPExecutableAllocate, - OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, OpenMPCriticalConstruct> + OpenMPAtomicConstruct, OmpAllocateDirective, OpenMPDispatchConstruct, + OpenMPUtilityConstruct, OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, + OpenMPCriticalConstruct> u; }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 71067283d13f7..ad456d89bc432 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3503,12 +3503,12 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, lower::pft::Evaluation &eval, const parser::OpenMPUtilityConstruct &); -static void -genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPDeclarativeAllocate &declarativeAllocate) { +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OmpAllocateDirective &allocate) { if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate"); + TODO(converter.getCurrentLocation(), "OmpAllocateDirective"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, @@ -3899,14 +3899,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct"); } -static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - const parser::OpenMPExecutableAllocate &execAllocConstruct) { - if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate"); -} - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, diff 
--git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 1f95259a857da..37c9c2d703c76 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -5,6 +5,7 @@ add_flang_library(FIRBuilder BoxValue.cpp Character.cpp Complex.cpp + CUDAIntrinsicCall.cpp CUFCommon.cpp DoLoopHelper.cpp FIRBuilder.cpp diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp new file mode 100644 index 0000000000000..6312e61f5e62a --- /dev/null +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -0,0 +1,1587 @@ +//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helper routines for constructing the FIR dialect of MLIR for CUDA +// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style +// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this +// module. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" +#include "flang/Evaluate/common.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/MutableBox.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" + +namespace fir { + +using CI = CUDAIntrinsicLibrary; + +static const char __ldca_i4x4[] = "__ldca_i4x4_"; +static const char __ldca_i8x2[] = "__ldca_i8x2_"; +static const char __ldca_r2x2[] = "__ldca_r2x2_"; +static const char __ldca_r4x4[] = "__ldca_r4x4_"; +static const char __ldca_r8x2[] = "__ldca_r8x2_"; +static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; +static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; +static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; +static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; +static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; +static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; +static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; +static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; +static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; +static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; +static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; +static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; +static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; +static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; +static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; +static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; +static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; +static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; +static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; +static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; + +// CUDA specific intrinsic handlers. 
+static constexpr IntrinsicHandler cudaHandlers[]{ + {"__ldca_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r2x2", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"all_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"any_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"atomicadd_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<4>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddr2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicandi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomiccasd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasul", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomicdeci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchul", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicinci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmind", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmini", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicori", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicxori", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"ballot_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"barrier_arrive", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArrive), + {{{"barrier", asAddr}}}, + /*isElemental=*/false}, + {"barrier_arrive_cnt", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArriveCnt), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_init", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genBarrierInit), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWait), + {{{"barrier", asAddr}, {"token", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait_sleep", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWaitSleep), + {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, + /*isElemental=*/false}, + {"clock", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::ClockOp>), + {}, + /*isElemental=*/false}, + {"clock64", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + 
&CI::genNVVMTime<mlir::NVVM::Clock64Op>), + {}, + /*isElemental=*/false}, + {"fence_proxy_async", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genFenceProxyAsync), + {}, + /*isElemental=*/false}, + {"globaltimer", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>), + {}, + /*isElemental=*/false}, + {"match_all_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_any_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"syncthreads", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genSyncThreads), + {}, + /*isElemental=*/false}, + {"syncthreads_and_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_and_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_count_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_count_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_or_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncthreads_or_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncwarp", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp), + {}, + /*isElemental=*/false}, + {"this_grid", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid), + {}, + /*isElemental=*/false}, + {"this_thread_block", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genThisThreadBlock), + {}, + /*isElemental=*/false}, + {"this_warp", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp), + {}, + /*isElemental=*/false}, + {"threadfence", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence), + {}, + /*isElemental=*/false}, + {"threadfence_block", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFenceBlock), + {}, + /*isElemental=*/false}, + {"threadfence_system", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFenceSystem), + {}, + /*isElemental=*/false}, + {"tma_bulk_commit_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkCommitGroup), + {{}}, + /*isElemental=*/false}, + {"tma_bulk_g2s", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR2), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_s2g", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G), + {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR2), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r8", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_wait_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkWaitGroup), + {{}}, + /*isElemental=*/false}, +}; + +template <std::size_t N> +static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) { + // Replace by std::sorted when C++20 is default (will be constexpr). + const IntrinsicHandler *lastSeen{nullptr}; + bool isSorted{true}; + for (const auto &x : array) { + if (lastSeen) + isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name}; + lastSeen = &x; + } + return isSorted; +} +static_assert(isSorted(cudaHandlers) && "map must be sorted"); + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) { + auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) { + return name.compare(cudaHandler.name) > 0; + }; + auto result = llvm::lower_bound(cudaHandlers, name, compare); + return result != std::end(cudaHandlers) && result->name == name ? result + : nullptr; +} + +static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier, + mlir::NVVM::NVVMMemorySpace space) { + mlir::Value llvmPtr = fir::ConvertOp::create( + builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), + barrier); + mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( + builder, loc, + mlir::LLVM::LLVMPointerType::get(builder.getContext(), + static_cast<unsigned>(space)), + llvmPtr); + return addrCast; +} + +static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, + mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, + mlir::Value arg1) { + auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + arg0 = builder.createConvert(loc, llvmPointerType, arg0); + return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, + mlir::LLVM::AtomicOrdering::seq_cst); +} + +// ATOMICADD +mlir::Value +CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::add + : mlir::LLVM::AtomicBinOp::fadd; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + + mlir::Value a = fir::getBase(args[0]); + + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + + auto loc = builder.getUnknownLoc(); + auto f16Ty = builder.getF16Type(); + auto i32Ty = builder.getI32Type(); + auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); + mlir::Type idxTy = builder.getIndexType(); + auto f16RefTy = fir::ReferenceType::get(f16Ty); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto i32VecTy = mlir::VectorType::get({1}, i32Ty); + mlir::Value vecI32 = + mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); + return mlir::vector::ExtractOp::create(builder, loc, vecI32, + mlir::ArrayRef<int64_t>{0}); +} + +// ATOMICADDVECTOR +template <int extent> +fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector( + mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({extent}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto vecTy = mlir::VectorType::get({extent}, resultType); + auto refTy = fir::ReferenceType::get(resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + + // Extract the values from the array. + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. + mlir::Value add = + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. 
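+ // The atomicrmw result carries the values read from memory before the + // update; copy each lane back into the stack temporary returned below.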
+ for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); + return fir::ArrayBoxValue(res, {ext}); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICCAS +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; + auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); + + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + mlir::Value arg2 = fir::getBase(args[2]); + + auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { + if (mlir::isa<mlir::Float32Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), + arg); + if (mlir::isa<mlir::Float64Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), + arg); + return arg; + }; + + arg1 = bitCastFloat(arg1); + arg2 = bitCastFloat(arg2); + + if (arg1.getType() != arg2.getType()) { + // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
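+ // Convert the replacement value to the compare value's type so both + // cmpxchg operands agree.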
+ arg2 = builder.createConvert(loc, arg1.getType(), arg2); + } + + auto address = + mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) + .getResult(0); + auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( + builder, loc, address, arg1, arg2, successOrdering, failureOrdering); + mlir::Value boolResult = + mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); + return builder.createConvert(loc, resultType, boolResult); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICEXCH +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + assert(arg1.getType().isIntOrFloat()); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; + return genAtomBinOp(builder, loc, binOp, arg0, arg1); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::max + : mlir::LLVM::AtomicBinOp::fmax; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::min + : mlir::LLVM::AtomicBinOp::fmin; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICSUB +mlir::Value +CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::sub + : mlir::LLVM::AtomicBinOp::fsub; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICXOR +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); +} + +// BARRIER_ARRIVE +mlir::Value +CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 1); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) + .getResult(); +} + +// BARRIER_ARRIVE_CNT +mlir::Value +CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); +} + +// BARRIER_INIT +void CUDAIntrinsicLibrary::genBarrierInit( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, + fir::getBase(args[1]), {}); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// BARRIER_TRY_WAIT +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); + fir::StoreOp::create(builder, loc, zero, res); + mlir::Value ns = + builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); + mlir::Value load = fir::LoadOp::create(builder, loc, res); + auto whileOp = mlir::scf::WhileOp::create( + builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); + mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); + mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(beforeBlock); + mlir::Value condition = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); + mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); + mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); + afterBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(afterBlock); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + mlir::Value ret = mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); + mlir::scf::YieldOp::create(builder, loc, 
ret); + builder.setInsertionPointAfter(whileOp); + return whileOp.getResult(0); +} + +// BARRIER_TRY_WAIT_SLEEP +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + return mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); +} + +// FENCE_PROXY_ASYNC +void CUDAIntrinsicLibrary::genFenceProxyAsync( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// __LDCA, __LDCS, __LDLU, __LDCV +template <const char *fctName, int extent> +fir::ExtendedValue +CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::Type resTy = fir::SequenceType::get(extent, resultType); + mlir::Value arg = fir::getBase(args[0]); + mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); + if (mlir::isa<fir::BaseBoxType>(arg.getType())) + arg = fir::BoxAddrOp::create(builder, loc, arg); + mlir::Type refResTy = fir::ReferenceType::get(resTy); + mlir::FunctionType ftype = + mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); + auto funcOp = builder.createFunction(loc, fctName, ftype); + llvm::SmallVector<mlir::Value> funcArgs; + funcArgs.push_back(res); + funcArgs.push_back(arg); + fir::CallOp::create(builder, loc, funcOp, funcArgs); + mlir::Value ext = + builder.createIntegerConstant(loc, builder.getIndexType(), extent); + return fir::ArrayBoxValue(res, {ext}); +} + +// CLOCK, CLOCK64, GLOBALTIMER +template <typename OpTy> +mlir::Value +CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0 && "expect no arguments"); + return OpTy::create(builder, loc, resultType).getResult(); +} + +// MATCH_ALL_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Type i1Ty = builder.getI1Type(); + mlir::MLIRContext *context = builder.getContext(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); + + mlir::Type retTy = + mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); + auto match = + mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, + mlir::NVVM::MatchSyncKind::all) + .getResult(); + auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); + auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); + auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); + fir::StoreOp::create(builder, loc, conv, args[2]); + return value; +} + +// MATCH_ANY_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + + return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], + arg1, mlir::NVVM::MatchSyncKind::any) + .getResult(); +} + +// SYNCTHREADS +void CUDAIntrinsicLibrary::genSyncThreads( + llvm::ArrayRef<fir::ExtendedValue> args) { + mlir::NVVM::Barrier0Op::create(builder, loc); +} + +// SYNCTHREADS_AND +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_COUNT +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_OR +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCWARP +void CUDAIntrinsicLibrary::genSyncWarp( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; + mlir::Value mask = fir::getBase(args[0]); + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> argsList{mask}; + fir::CallOp::create(builder, 
loc, funcOp, argsList); +} + +// THIS_GRID +mlir::Value +CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); + mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); + mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); + mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); + mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); + + // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * + // (blockDim.x * gridDim.x); + mlir::Value resZ = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); + mlir::Value resY = + mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); + mlir::Value resX = + mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); + mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); + + // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + + // blockIdx.x; + // this_grid.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + + // ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); + + mlir::Value bXbY = + mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); + mlir::Value bXbYbZ = + mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); + mlir::Value tZbY = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value tZbYbX = + mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); + mlir::Value tYbX = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = 
fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THIS_THREAD_BLOCK +mlir::Value +CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value size = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); + size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); + + // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; 
+} + +// THIS_WARP +mlir::Value +CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // coalesced_group%size = 32 + mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + // coalesced_group%rank = threadIdx.x & 31 + 1 + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + mlir::Value masked = + mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THREADFENCE +void CUDAIntrinsicLibrary::genThreadFence( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// THREADFENCE_BLOCK +void CUDAIntrinsicLibrary::genThreadFenceBlock( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// THREADFENCE_SYSTEM +void CUDAIntrinsicLibrary::genThreadFenceSystem( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// TMA_BULK_COMMIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkCommitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); +} + +// TMA_BULK_G2S +void CUDAIntrinsicLibrary::genTMABulkG2S( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value barrier = 
convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = + convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), + mlir::NVVM::NVVMMemorySpace::SharedCluster); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( + builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); +} + +static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value barrier, mlir::Value src, + mlir::Value dst, mlir::Value nelem, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + barrier = builder.createConvert(loc, llvmPtrTy, barrier); + dst = builder.createConvert(loc, llvmPtrTy, dst); + src = builder.createConvert(loc, llvmPtrTy, src); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, + "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " + "[%1], %2, [%3];", + {}); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {barrier, size}, {}, + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); +} + +// TMA_BULK_LOADC4 +void CUDAIntrinsicLibrary::genTMABulkLoadC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADC8 +void CUDAIntrinsicLibrary::genTMABulkLoadC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI4 +void CUDAIntrinsicLibrary::genTMABulkLoadI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI8 +void CUDAIntrinsicLibrary::genTMABulkLoadI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR2 +void CUDAIntrinsicLibrary::genTMABulkLoadR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR4 +void CUDAIntrinsicLibrary::genTMABulkLoadR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// 
TMA_BULK_LOADR8 +void CUDAIntrinsicLibrary::genTMABulkLoadR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_S2G +void CUDAIntrinsicLibrary::genTMABulkS2G( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), + mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( + builder, loc, dst, src, fir::getBase(args[2]), {}, {}); + + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value src, mlir::Value dst, mlir::Value count, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + src = convertPtrToNVVMSpace(builder, loc, src, + mlir::NVVM::NVVMMemorySpace::Shared); + dst = convertPtrToNVVMSpace(builder, loc, dst, + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, + size, {}, {}); + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +// TMA_BULK_STORE_C4 +void CUDAIntrinsicLibrary::genTMABulkStoreC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_C8 +void CUDAIntrinsicLibrary::genTMABulkStoreC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I4 +void CUDAIntrinsicLibrary::genTMABulkStoreI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I8 +void CUDAIntrinsicLibrary::genTMABulkStoreI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R2 +void CUDAIntrinsicLibrary::genTMABulkStoreR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R4 +void 
CUDAIntrinsicLibrary::genTMABulkStoreR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R8 +void CUDAIntrinsicLibrary::genTMABulkStoreR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_WAIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkWaitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto group = builder.getIntegerAttr(builder.getI32Type(), 0); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); +} + +// ALL_SYNC, ANY_SYNC, BALLOT_SYNC +template <mlir::NVVM::VoteSyncKind kind> +mlir::Value +CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value arg1 = + fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); + mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot + ? builder.getI32Type() + : builder.getI1Type(); + auto voteRes = + mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) + .getResult(); + return fir::ConvertOp::create(builder, loc, resultType, voteRes); +} + +} // namespace fir diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 15ea84565dd75..3eb60448fae38 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Common/static-multimap-view.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/Complex.h" @@ -50,7 +51,6 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -108,34 +108,6 @@ using I = IntrinsicLibrary; /// argument is an optional variable in the current scope). static constexpr bool handleDynamicOptional = true; -/// TODO: Move all CUDA Fortran intrinsic handlers into its own file similar to -/// PPC. 
-static const char __ldca_i4x4[] = "__ldca_i4x4_"; -static const char __ldca_i8x2[] = "__ldca_i8x2_"; -static const char __ldca_r2x2[] = "__ldca_r2x2_"; -static const char __ldca_r4x4[] = "__ldca_r4x4_"; -static const char __ldca_r8x2[] = "__ldca_r8x2_"; -static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; -static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; -static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; -static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; -static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; -static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; -static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; -static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; -static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; -static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; -static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; -static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; -static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; -static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; -static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; -static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; -static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; -static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; -static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; -static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; - /// Table that drives the fir generation depending on the intrinsic or intrinsic /// module procedure one to one mapping with Fortran arguments. If no mapping is /// defined here for a generic intrinsic, genRuntimeCall will be called @@ -144,106 +116,6 @@ static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; /// argument must not be lowered by value. In which case, the lowering rules /// should be provided for all the intrinsic arguments for completeness. static constexpr IntrinsicHandler handlers[]{ - {"__ldca_i4x4", - &I::genCUDALDXXFunc<__ldca_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_i8x2", - &I::genCUDALDXXFunc<__ldca_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r2x2", - &I::genCUDALDXXFunc<__ldca_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r4x4", - &I::genCUDALDXXFunc<__ldca_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r8x2", - &I::genCUDALDXXFunc<__ldca_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i4x4", - &I::genCUDALDXXFunc<__ldcg_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i8x2", - &I::genCUDALDXXFunc<__ldcg_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r2x2", - &I::genCUDALDXXFunc<__ldcg_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r4x4", - &I::genCUDALDXXFunc<__ldcg_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r8x2", - &I::genCUDALDXXFunc<__ldcg_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i4x4", - &I::genCUDALDXXFunc<__ldcs_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i8x2", - &I::genCUDALDXXFunc<__ldcs_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r2x2", - &I::genCUDALDXXFunc<__ldcs_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r4x4", - &I::genCUDALDXXFunc<__ldcs_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r8x2", - &I::genCUDALDXXFunc<__ldcs_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i4x4", - &I::genCUDALDXXFunc<__ldcv_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i8x2", - &I::genCUDALDXXFunc<__ldcv_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - 
{"__ldcv_r2x2", - &I::genCUDALDXXFunc<__ldcv_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r4x4", - &I::genCUDALDXXFunc<__ldcv_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r8x2", - &I::genCUDALDXXFunc<__ldcv_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i4x4", - &I::genCUDALDXXFunc<__ldlu_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i8x2", - &I::genCUDALDXXFunc<__ldlu_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r2x2", - &I::genCUDALDXXFunc<__ldlu_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r4x4", - &I::genCUDALDXXFunc<__ldlu_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r8x2", - &I::genCUDALDXXFunc<__ldlu_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, {"abort", &I::genAbort}, {"abs", &I::genAbs}, {"achar", &I::genChar}, @@ -263,10 +135,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAll, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"all_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::all>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"allocated", &I::genAllocated, {{{"array", asInquired}, {"scalar", asInquired}}}, @@ -276,10 +144,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAny, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"any_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::any>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"asind", &I::genAsind}, {"asinpi", &I::genAsinpi}, {"associated", @@ -290,83 +154,6 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, - {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomiccasd", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasf", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasi", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasul", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomicdeci", &I::genAtomicDec, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicexchd", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchf", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchi", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchul", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicinci", &I::genAtomicInc, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxd", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxf", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxi", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxl", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmind", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminf", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmini", 
&I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminl", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicori", &I::genAtomicOr, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubd", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubf", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"ballot_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, - {"barrier_arrive", - &I::genBarrierArrive, - {{{"barrier", asAddr}}}, - /*isElemental=*/false}, - {"barrier_arrive_cnt", - &I::genBarrierArriveCnt, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_init", - &I::genBarrierInit, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait", - &I::genBarrierTryWait, - {{{"barrier", asAddr}, {"token", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait_sleep", - &I::genBarrierTryWaitSleep, - {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, - /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -410,11 +197,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, - {"clock", &I::genNVVMTime<mlir::NVVM::ClockOp>, {}, /*isElemental=*/false}, - {"clock64", - &I::genNVVMTime<mlir::NVVM::Clock64Op>, - {}, - /*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, @@ -511,10 +293,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genExtendsTypeOf, {{{"a", asBox}, {"mold", asBox}}}, /*isElemental=*/false}, - {"fence_proxy_async", - &I::genFenceProxyAsync, - {}, - /*isElemental=*/false}, {"findloc", &I::genFindloc, {{{"array", asBox}, @@ -569,10 +347,6 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, - {"globaltimer", - &I::genNVVMTime<mlir::NVVM::GlobalTimerOp>, - {}, - /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -740,38 +514,6 @@ static constexpr IntrinsicHandler handlers[]{ {"malloc", &I::genMalloc}, {"maskl", &I::genMask<mlir::arith::ShLIOp>}, {"maskr", &I::genMask<mlir::arith::ShRUIOp>}, - {"match_all_syncjd", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjf", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjj", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjx", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_any_syncjd", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjf", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjj", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - 
{"match_any_syncjx", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, {"matmul", &I::genMatmul, {{{"matrix_a", asAddr}, {"matrix_b", asAddr}}}, @@ -997,20 +739,6 @@ static constexpr IntrinsicHandler handlers[]{ {"dim", asValue}, {"mask", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false}, - {"syncthreads_and_i4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_and_l4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_count_i4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_count_l4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_or_i4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncthreads_or_l4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncwarp", &I::genSyncWarp, {}, /*isElemental=*/false}, {"system", &I::genSystem, {{{"command", asBox}, {"exitstat", asBox, handleDynamicOptional}}}, @@ -1021,115 +749,13 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"tand", &I::genTand}, {"tanpi", &I::genTanpi}, - {"this_grid", &I::genThisGrid, {}, /*isElemental=*/false}, {"this_image", &I::genThisImage, {{{"coarray", asBox}, {"dim", asAddr}, {"team", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"this_thread_block", &I::genThisThreadBlock, {}, /*isElemental=*/false}, - {"this_warp", &I::genThisWarp, {}, /*isElemental=*/false}, - {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, - {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, - {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, - {"tma_bulk_commit_group", - &I::genTMABulkCommitGroup, - {{}}, - /*isElemental=*/false}, - {"tma_bulk_g2s", - &I::genTMABulkG2S, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc4", - &I::genTMABulkLoadC4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc8", - &I::genTMABulkLoadC8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi4", - &I::genTMABulkLoadI4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi8", - &I::genTMABulkLoadI8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr2", - &I::genTMABulkLoadR2, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr4", - &I::genTMABulkLoadR4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr8", - &I::genTMABulkLoadR8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_s2g", - &I::genTMABulkS2G, - {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c4", - &I::genTMABulkStoreC4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c8", - &I::genTMABulkStoreC8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i4", 
- &I::genTMABulkStoreI4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i8", - &I::genTMABulkStoreI8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r2", - &I::genTMABulkStoreR2, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r4", - &I::genTMABulkStoreR4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r8", - &I::genTMABulkStoreR8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_wait_group", - &I::genTMABulkWaitGroup, - {{}}, - /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -2221,6 +1847,9 @@ lookupIntrinsicHandler(fir::FirOpBuilder &builder, if (isPPCTarget) if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) return std::make_optional<IntrinsicHandlerEntry>(ppcHandler); + // TODO: Look for CUDA intrinsic handlers only if CUDA is enabled. + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + return std::make_optional<IntrinsicHandlerEntry>(cudaHandler); // Subroutines should have a handler. if (!resultType) return std::nullopt; @@ -3107,159 +2736,6 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, return mlir::arith::MulFOp::create(builder, loc, atan, factor); } -static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, - mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, - mlir::Value arg1) { - auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - arg0 = builder.createConvert(loc, llvmPointerType, arg0); - return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, - mlir::LLVM::AtomicOrdering::seq_cst); -} - -mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::add - : mlir::LLVM::AtomicBinOp::fadd; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::sub - : mlir::LLVM::AtomicBinOp::fsub; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICCAS -fir::ExtendedValue -IntrinsicLibrary::genAtomicCas(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; - auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); - - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - mlir::Value arg2 = fir::getBase(args[2]); - - auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { - if (mlir::isa<mlir::Float32Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), - arg); - if (mlir::isa<mlir::Float64Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), - arg); - return arg; - }; - - arg1 = bitCastFloat(arg1); - arg2 = bitCastFloat(arg2); - - if (arg1.getType() != arg2.getType()) { - // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
- arg2 = builder.createConvert(loc, arg1.getType(), arg2); - } - - auto address = - mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) - .getResult(0); - auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( - builder, loc, address, arg1, arg2, successOrdering, failureOrdering); - mlir::Value boolResult = - mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); - return builder.createConvert(loc, resultType, boolResult); -} - -mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICEXCH -fir::ExtendedValue -IntrinsicLibrary::genAtomicExch(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - assert(arg1.getType().isIntOrFloat()); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; - return genAtomBinOp(builder, loc, binOp, arg0, arg1); -} - -mlir::Value IntrinsicLibrary::genAtomicInc(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMax(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::max - : mlir::LLVM::AtomicBinOp::fmax; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMin(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::min - : mlir::LLVM::AtomicBinOp::fmin; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICXOR -fir::ExtendedValue -IntrinsicLibrary::genAtomicXor(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); -} - // ASSOCIATED fir::ExtendedValue IntrinsicLibrary::genAssociated(mlir::Type resultType, @@ -3311,114 +2787,6 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value barrier, - mlir::NVVM::NVVMMemorySpace space) { - mlir::Value llvmPtr = fir::ConvertOp::create( - builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), - barrier); - mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( - builder, loc, - mlir::LLVM::LLVMPointerType::get(builder.getContext(), - static_cast<unsigned>(space)), - llvmPtr); - return addrCast; -} - -// BARRIER_ARRIVE (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 1); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, - barrier) - .getResult(); -} - -// BARRIER_ARRIBVE_CNT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); - // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and - // currently just the sink symbol `_`. 
- // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], - {}); - return fir::LoadOp::create(builder, loc, token); -} - -// BARRIER_INIT (CUDA) -void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, - fir::getBase(args[1]), {}); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - -// BARRIER_TRY_WAIT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - fir::StoreOp::create(builder, loc, zero, res); - mlir::Value ns = - builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); - mlir::Value load = fir::LoadOp::create(builder, loc, res); - auto whileOp = mlir::scf::WhileOp::create( - builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); - mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); - mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(beforeBlock); - mlir::Value condition = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); - mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); - mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); - afterBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(afterBlock); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - mlir::Value ret = - mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], ns}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); - mlir::scf::YieldOp::create(builder, loc, ret); - builder.setInsertionPointAfter(whileOp); - return whileOp.getResult(0); -} - -// BARRIER_TRY_WAIT_SLEEP (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - return mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); -} - // BESSEL_JN fir::ExtendedValue IntrinsicLibrary::genBesselJn(mlir::Type resultType, @@ -4152,30 +3520,6 @@ IntrinsicLibrary::genCshift(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "CSHIFT"); } -// __LDCA, __LDCS, __LDLU, __LDCV -template <const char *fctName, int extent> -fir::ExtendedValue -IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - 
assert(args.size() == 1); - mlir::Type resTy = fir::SequenceType::get(extent, resultType); - mlir::Value arg = fir::getBase(args[0]); - mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); - if (mlir::isa<fir::BaseBoxType>(arg.getType())) - arg = fir::BoxAddrOp::create(builder, loc, arg); - mlir::Type refResTy = fir::ReferenceType::get(resTy); - mlir::FunctionType ftype = - mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); - auto funcOp = builder.createFunction(loc, fctName, ftype); - llvm::SmallVector<mlir::Value> funcArgs; - funcArgs.push_back(res); - funcArgs.push_back(arg); - fir::CallOp::create(builder, loc, funcOp, funcArgs); - mlir::Value ext = - builder.createIntegerConstant(loc, builder.getIndexType(), extent); - return fir::ArrayBoxValue(res, {ext}); -} - // DATE_AND_TIME void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 4 && "date_and_time has 4 args"); @@ -4508,17 +3852,6 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType, fir::getBase(args[1]))); } -// FENCE_PROXY_ASYNC (CUDA) -void IntrinsicLibrary::genFenceProxyAsync( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - // FINDLOC fir::ExtendedValue IntrinsicLibrary::genFindloc(mlir::Type resultType, @@ -7029,67 +6362,6 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType, return result; } -// MATCH_ALL_SYNC -mlir::Value -IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Type i1Ty = builder.getI1Type(); - mlir::MLIRContext *context = builder.getContext(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - - mlir::Type retTy = - mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); - auto match = - mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, - mlir::NVVM::MatchSyncKind::all) - .getResult(); - auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); - auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); - auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); - fir::StoreOp::create(builder, loc, conv, args[2]); - return value; -} - -// ALL_SYNC, ANY_SYNC, BALLOT_SYNC -template <mlir::NVVM::VoteSyncKind kind> -mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value arg1 = - fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); - mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot - ? 
builder.getI32Type() - : builder.getI1Type(); - auto voteRes = - mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) - .getResult(); - return fir::ConvertOp::create(builder, loc, resultType, voteRes); -} - -// MATCH_ANY_SYNC -mlir::Value -IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - - return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], - arg1, mlir::NVVM::MatchSyncKind::any) - .getResult(); -} - // MATMUL fir::ExtendedValue IntrinsicLibrary::genMatmul(mlir::Type resultType, @@ -7707,14 +6979,6 @@ IntrinsicLibrary::genNumImages(mlir::Type resultType, return mif::NumImagesOp::create(builder, loc).getResult(); } -// CLOCK, CLOCK64, GLOBALTIMER -template <typename OpTy> -mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0 && "expect no arguments"); - return OpTy::create(builder, loc, resultType).getResult(); -} - // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, @@ -8689,92 +7953,6 @@ mlir::Value IntrinsicLibrary::genTanpi(mlir::Type resultType, return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); } -// THIS_GRID -mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); - mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); - mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); - mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); - mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); - - // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * - // (blockDim.x * gridDim.x); - mlir::Value resZ = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); - mlir::Value resY = - mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); - mlir::Value resX = - mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); - mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); - - // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + - // blockIdx.x; - // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + - // ((threadIdx.z * 
blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); - - mlir::Value bXbY = - mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); - mlir::Value bXbYbZ = - mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); - mlir::Value tZbY = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value tZbYbX = - mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); - mlir::Value tYbX = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // THIS_IMAGE fir::ExtendedValue IntrinsicLibrary::genThisImage(mlir::Type resultType, @@ -8790,99 +7968,6 @@ IntrinsicLibrary::genThisImage(mlir::Type resultType, return builder.createConvert(loc, resultType, res); } -// THIS_THREAD_BLOCK -mlir::Value -IntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value size = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); - size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); - - // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, 
i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - -// THIS_WARP -mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // coalesced_group%size = 32 - mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - // coalesced_group%rank = threadIdx.x & 31 + 1 - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - mlir::Value masked = - mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // TRAILZ 
mlir::Value IntrinsicLibrary::genTrailz(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9104,65 +8189,6 @@ IntrinsicLibrary::genSum(mlir::Type resultType, resultType, args); } -// SYNCTHREADS -void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef<fir::ExtendedValue> args) { - mlir::NVVM::Barrier0Op::create(builder, loc); -} - -// SYNCTHREADS_AND -mlir::Value -IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_COUNT -mlir::Value -IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_OR -mlir::Value -IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCWARP -void IntrinsicLibrary::genSyncWarp(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 1); - constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; - mlir::Value mask = fir::getBase(args[0]); - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> argsList{mask}; - fir::CallOp::create(builder, loc, funcOp, argsList); -} - // SYSTEM fir::ExtendedValue IntrinsicLibrary::genSystem(std::optional<mlir::Type> resultType, @@ -9294,38 +8320,6 @@ IntrinsicLibrary::genTranspose(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "TRANSPOSE"); } -// THREADFENCE -void IntrinsicLibrary::genThreadFence(llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_BLOCK -void IntrinsicLibrary::genThreadFenceBlock( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = 
builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_SYSTEM -void IntrinsicLibrary::genThreadFenceSystem( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - // TIME mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9334,226 +8328,6 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, fir::runtime::genTime(builder, loc)); } -// TMA_BULK_COMMIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkCommitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); -} - -// TMA_BULK_G2S (CUDA) -void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = - convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), - mlir::NVVM::NVVMMemorySpace::SharedCluster); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( - builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); -} - -static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value barrier, mlir::Value src, - mlir::Value dst, mlir::Value nelem, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - barrier = builder.createConvert(loc, llvmPtrTy, barrier); - dst = builder.createConvert(loc, llvmPtrTy, dst); - src = builder.createConvert(loc, llvmPtrTy, src); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, - "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " - "[%1], %2, [%3];", - {}); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {barrier, size}, {}, - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); -} - -// TMA_BULK_LOADC4 -void IntrinsicLibrary::genTMABulkLoadC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADC8 -void IntrinsicLibrary::genTMABulkLoadC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI4 -void IntrinsicLibrary::genTMABulkLoadI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), 
fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI8 -void IntrinsicLibrary::genTMABulkLoadI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR2 -void IntrinsicLibrary::genTMABulkLoadR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR4 -void IntrinsicLibrary::genTMABulkLoadR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR8 -void IntrinsicLibrary::genTMABulkLoadR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_S2G (CUDA) -void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), - mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( - builder, loc, dst, src, fir::getBase(args[2]), {}, {}); - - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value src, mlir::Value dst, mlir::Value count, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); - src = convertPtrToNVVMSpace(builder, loc, src, - mlir::NVVM::NVVMMemorySpace::Shared); - dst = convertPtrToNVVMSpace(builder, loc, dst, - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, - size, {}, {}); - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -// TMA_BULK_STORE_C4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_C8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - 
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R2 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_WAIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkWaitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto group = builder.getIntegerAttr(builder.getI32Type(), 0); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); -} - // TRIM fir::ExtendedValue IntrinsicLibrary::genTrim(mlir::Type resultType, @@ -9968,6 +8742,9 @@ getIntrinsicArgumentLowering(llvm::StringRef specificName) { if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) if (!ppcHandler->argLoweringRules.hasDefaultRules()) return &ppcHandler->argLoweringRules; + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + if (!cudaHandler->argLoweringRules.hasDefaultRules()) + return &cudaHandler->argLoweringRules; return nullptr; } diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index a9de26ea09ff8..4374acbbe51bf 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1778,6 +1778,31 @@ struct OmpBlockConstructParser { llvm::omp::Directive dir_; }; +struct OmpDeclarativeAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional<resultType> Parse(ParseState &state) const { + constexpr llvm::omp::Directive dir{llvm::omp::Directive::OMPD_allocate}; + if (auto &&begin{attempt(OmpBeginDirectiveParser(dir)).Parse(state)}) { + Block empty; + auto end{maybe(OmpEndDirectiveParser{dir}).Parse(state)}; + return OmpAllocateDirective{std::move(*begin), std::move(empty), + llvm::transformOptional(std::move(*end), + [](auto &&s) { return OmpEndDirective(std::move(s)); })}; + } + return std::nullopt; + } +}; + +struct 
OmpExecutableAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional<resultType> Parse(ParseState &state) const { + OmpStatementConstructParser p{llvm::omp::Directive::OMPD_allocate}; + return construct<OmpAllocateDirective>(p).Parse(state); + } +}; + TYPE_PARSER(sourced(construct<OpenMPAllocatorsConstruct>( OmpStatementConstructParser{llvm::omp::Directive::OMPD_allocators}))) @@ -2044,14 +2069,6 @@ TYPE_PARSER(construct<OmpInitializerExpression>(OmpStylizedExpressionParser{})) TYPE_PARSER(sourced(construct<OpenMPCriticalConstruct>( OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical}))) -// 2.11.3 Executable Allocate directive -TYPE_PARSER(sourced(construct<OpenMPExecutableAllocate>( - verbatim("ALLOCATE"_tok), maybe(parenthesized(Parser<OmpObjectList>{})), - Parser<OmpClauseList>{}, - maybe(nonemptyList(startOmpLine >> Parser<OpenMPDeclarativeAllocate>{})) / - endOmpLine, - statement(allocateStmt)))) - // 2.8.2 Declare Simd construct TYPE_PARSER(sourced(construct<OpenMPDeclareSimdConstruct>( predicated(Parser<OmpDirectiveName>{}, @@ -2077,13 +2094,6 @@ TYPE_PARSER(sourced( // IsDirective(llvm::omp::Directive::OMPD_threadprivate)) >= Parser<OmpDirectiveSpecification>{}))) -// 2.11.3 Declarative Allocate directive -TYPE_PARSER( - sourced(construct<OpenMPDeclarativeAllocate>(verbatim("ALLOCATE"_tok), - maybe(parenthesized(Parser<OmpObjectList>{})), - Parser<OmpClauseList>{})) / - lookAhead(endOmpLine / !statement(allocateStmt))) - // Assumes Construct TYPE_PARSER(sourced(construct<OpenMPDeclarativeAssumes>( predicated(OmpDirectiveNameParser{}, @@ -2106,7 +2116,7 @@ TYPE_PARSER( construct<OpenMPDeclarativeConstruct>( Parser<OmpDeclareVariantDirective>{}) || construct<OpenMPDeclarativeConstruct>( - Parser<OpenMPDeclarativeAllocate>{}) || + sourced(OmpDeclarativeAllocateParser{})) || construct<OpenMPDeclarativeConstruct>( Parser<OpenMPGroupprivate>{}) || construct<OpenMPDeclarativeConstruct>( @@ -2194,6 +2204,8 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, withMessage("expected OpenMP construct"_err_en_US, first(construct<OpenMPConstruct>(Parser<OpenMPSectionsConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPLoopConstruct>{}), + construct<OpenMPConstruct>( + sourced(OmpExecutableAllocateParser{})), construct<OpenMPConstruct>(Parser<OmpBlockConstruct>{}), // OmpBlockConstruct is attempted before // OpenMPStandaloneConstruct to resolve !$OMP ORDERED @@ -2201,9 +2213,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPUtilityConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPDispatchConstruct>{}), - construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}), construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}), - construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}), construct<OpenMPConstruct>(Parser<OpenMPAssumeConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPCriticalConstruct>{})))) diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp index 95ad3f60770f5..b9d3763cdd06d 100644 --- a/flang/lib/Parser/openmp-utils.cpp +++ b/flang/lib/Parser/openmp-utils.cpp @@ -22,6 +22,25 @@ namespace Fortran::parser::omp { +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x) { + if (auto *y = std::get_if<SpecificationConstruct>(&x.u)) { + if (auto *z{std::get_if<common::Indirection<OpenMPDeclarativeConstruct>>( + &y->u)}) { + return &z->value(); + } + } + return nullptr; +} + +const 
OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x) { + if (auto *y{std::get_if<ExecutableConstruct>(&x.u)}) { + if (auto *z{std::get_if<common::Indirection<OpenMPConstruct>>(&y->u)}) { + return &z->value(); + } + } + return nullptr; +} + const OmpObjectList *GetOmpObjectList(const OmpClause &clause) { // Clauses with OmpObjectList as its data member using MemberObjectListClauses = std::tuple<OmpClause::Copyin, @@ -86,4 +105,24 @@ const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init) { return nullptr; } +static void SplitOmpAllocateHelper( + OmpAllocateInfo &n, const OmpAllocateDirective &x) { + n.dirs.push_back(&x); + const Block &body{std::get<Block>(x.t)}; + if (!body.empty()) { + if (auto *omp{GetOmp(body.front())}) { + if (auto *ad{std::get_if<OmpAllocateDirective>(&omp->u)}) { + return SplitOmpAllocateHelper(n, *ad); + } + } + n.body = &body.front(); + } +} + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x) { + OmpAllocateInfo info; + SplitOmpAllocateHelper(info, x); + return info; +} + } // namespace Fortran::parser::omp diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index efce8fc3d2e35..8cccd84f9fa19 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -557,7 +557,7 @@ bool Prescanner::MustSkipToEndOfLine() const { return true; // skip over ignored columns in right margin (73:80) } else if (*at_ == '!' && !inCharLiteral_ && (!inFixedForm_ || tabInCurrentLine_ || column_ != 6)) { - return !IsCompilerDirectiveSentinel(at_ + 1); + return InCompilerDirective() || !IsCompilerDirectiveSentinel(at_ + 1); } else { return false; } diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 9255c4e1136bc..84123030195e9 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2482,30 +2482,8 @@ class UnparseVisitor { Unparse(static_cast<const OmpBlockConstruct &>(x)); } - void Unparse(const OpenMPExecutableAllocate &x) { - const auto &fields = - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t); - if (fields) { - for (const auto &decl : *fields) { - Walk(decl); - } - } - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get<std::optional<OmpObjectList>>(x.t), ")"); - Walk(std::get<OmpClauseList>(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get<Statement<AllocateStmt>>(x.t)); - } - void Unparse(const OpenMPDeclarativeAllocate &x) { - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get<std::optional<OmpObjectList>>(x.t), ")"); - Walk(std::get<OmpClauseList>(x.t)); - Put("\n"); - EndOpenMP(); + void Unparse(const OmpAllocateDirective &x) { + Unparse(static_cast<const OmpBlockConstruct &>(x)); } void Unparse(const OpenMPAllocatorsConstruct &x) { Unparse(static_cast<const OmpBlockConstruct &>(x)); diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index c884658bf464a..a11c5250b1ab4 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -51,8 +51,6 @@ class CanonicalizationOfOmp { } // Block list } - void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); } - // Pre-visit all constructs that have both a specification part and // an execution part, and store the connection between the two. 
bool Pre(parser::BlockConstruct &x) { @@ -88,6 +86,7 @@ class CanonicalizationOfOmp { void Post(parser::SpecificationPart &spec) { CanonicalizeUtilityConstructs(spec); + CanonicalizeAllocateDirectives(spec); } void Post(parser::OmpMapClause &map) { CanonicalizeMapModifiers(map); } @@ -239,33 +238,138 @@ class CanonicalizationOfOmp { } } - void RewriteOmpAllocations(parser::ExecutionPart &body) { - // Rewrite leading declarative allocations so they are nested - // within their respective executable allocate directive - // - // Original: - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // - // After rewriting: - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - for (auto it = body.v.rbegin(); it != body.v.rend();) { - if (auto *exec = GetOmpIf<parser::OpenMPExecutableAllocate>(*(it++))) { - parser::OpenMPDeclarativeAllocate *decl; - std::list<parser::OpenMPDeclarativeAllocate> subAllocates; - while (it != body.v.rend() && - (decl = GetOmpIf<parser::OpenMPDeclarativeAllocate>(*it))) { - subAllocates.push_front(std::move(*decl)); - it = decltype(it)(body.v.erase(std::next(it).base())); + // Canonicalization of allocate directives + // + // In OpenMP 5.0 and 5.1 the allocate directive could either be a declarative + // one or an executable one. As usual in such cases, this poses a problem + // when the directive appears at the boundary between the specification part + // and the execution part. + // The executable form can actually consist of several adjacent directives, + // whereas the declarative form is always standalone. Additionally, the + // executable form must be associated with an allocate statement. + // + // The parser tries to parse declarative statements first, so in the + // following case, the two directives will be declarative, even though + // they should be treated as a single executable form: + // integer, allocatable :: x, y ! Specification + // !$omp allocate(x) + // !$omp allocate(y) + // allocate(x, y) ! Execution + // + void CanonicalizeAllocateDirectives(parser::SpecificationPart &spec) { + auto found = blockForSpec_.find(&spec); + if (found == blockForSpec_.end()) { + // There is no corresponding execution part, so there is nothing to do. + return; + } + parser::Block &block = *found->second; + + auto isAllocateStmt = [](const parser::ExecutionPartConstruct &epc) { + if (auto *ec = std::get_if<parser::ExecutableConstruct>(&epc.u)) { + if (auto *as = + std::get_if<parser::Statement<parser::ActionStmt>>(&ec->u)) { + return std::holds_alternative< + common::Indirection<parser::AllocateStmt>>(as->statement.u); + } + } + return false; + }; + + if (!block.empty() && isAllocateStmt(block.front())) { + // There are two places where an OpenMP declarative construct can + // show up in the tuple in specification part: + // (1) in std::list<OpenMPDeclarativeConstruct>, or + // (2) in std::list<DeclarationConstruct>. + // The case (1) is only possible if the list (2) is empty. 
+ + auto &omps = + std::get<std::list<parser::OpenMPDeclarativeConstruct>>(spec.t); + auto &decls = std::get<std::list<parser::DeclarationConstruct>>(spec.t); + + if (!decls.empty()) { + MakeExecutableAllocateFromDecls(decls, block); + } else { + MakeExecutableAllocateFromOmps(omps, block); + } + } + } + + parser::ExecutionPartConstruct EmbedInExec( + parser::OmpAllocateDirective *alo, parser::ExecutionPartConstruct &&epc) { + // Nest current epc inside the allocate directive. + std::get<parser::Block>(alo->t).push_front(std::move(epc)); + // Set the new epc to be the ExecutionPartConstruct made from + // the allocate directive. + parser::OpenMPConstruct opc(std::move(*alo)); + common::Indirection<parser::OpenMPConstruct> ind(std::move(opc)); + parser::ExecutableConstruct ec(std::move(ind)); + return parser::ExecutionPartConstruct(std::move(ec)); + } + + void MakeExecutableAllocateFromDecls( + std::list<parser::DeclarationConstruct> &decls, parser::Block &body) { + using OpenMPDeclarativeConstruct = + common::Indirection<parser::OpenMPDeclarativeConstruct>; + + auto getAllocate = [](parser::DeclarationConstruct *dc) { + if (auto *sc = std::get_if<parser::SpecificationConstruct>(&dc->u)) { + if (auto *odc = std::get_if<OpenMPDeclarativeConstruct>(&sc->u)) { + if (auto *alo = + std::get_if<parser::OmpAllocateDirective>(&odc->value().u)) { + return alo; + } + } + } + return static_cast<parser::OmpAllocateDirective *>(nullptr); + }; + + std::list<parser::DeclarationConstruct>::reverse_iterator rlast = [&]() { + for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) { + if (getAllocate(&*rit) == nullptr) { + return rit; } - if (!subAllocates.empty()) { - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - exec->t) = {std::move(subAllocates)}; + } + return decls.rend(); + }(); + + if (rlast != decls.rbegin()) { + // We have already checked that the first statement in body is + // ALLOCATE. + parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = decls.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec(getAllocate(&*rit), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + decls.erase(rlast.base(), decls.end()); + } + } + + void MakeExecutableAllocateFromOmps( + std::list<parser::OpenMPDeclarativeConstruct> &omps, + parser::Block &body) { + using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct; + + std::list<OpenMPDeclarativeConstruct>::reverse_iterator rlast = [&]() { + for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { + if (!std::holds_alternative<parser::OmpAllocateDirective>(rit->u)) { + return rit; } } + return omps.rend(); + }(); + + if (rlast != omps.rbegin()) { + parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = omps.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec( + &std::get<parser::OmpAllocateDirective>(rit->u), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + omps.erase(rlast.base(), omps.end()); } } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 3ea8e5b8cd2b0..d7db15dd37949 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -179,29 +179,21 @@ void OmpStructureChecker::Leave(const parser::BlockConstruct &x) { } } -// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. 
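To make the canonicalization above concrete, here is a standalone sketch of the reverse-scan plus fold that MakeExecutableAllocateFromDecls/EmbedInExec implement. It uses plain strings instead of parse-tree nodes, so the braces only suggest the nesting; the structure and iterator handling follow the patch.

#include <iostream>
#include <list>
#include <string>

int main() {
  // The specification part ends with two directives parsed as declarative.
  std::list<std::string> decls{"TypeDeclarationStmt",
                               "!$omp allocate(x)",
                               "!$omp allocate(y)"};
  std::string firstExecStmt{"allocate(x, y)"};

  auto isAllocateDir = [](const std::string &s) {
    return s.rfind("!$omp allocate", 0) == 0;
  };

  // Reverse-scan for the first entry that is not an allocate directive;
  // everything after it is the run to be re-nested (cf. rlast in the patch).
  auto rlast = decls.rbegin();
  while (rlast != decls.rend() && isAllocateDir(*rlast))
    ++rlast;

  // Fold: wrap the statement in the run, innermost-out (cf. EmbedInExec),
  // so the first directive in source order ends up outermost.
  std::string nested{firstExecStmt};
  for (auto rit = decls.rbegin(); rit != rlast; ++rit)
    nested = *rit + " { " + nested + " }";

  decls.erase(rlast.base(), decls.end()); // drop the migrated directives
  std::cout << nested << "\n";
  // Prints: !$omp allocate(x) { !$omp allocate(y) { allocate(x, y) } }
}

The printed shape matches the canonical form the parse-tree tests below now expect: each OmpAllocateDirective holds the next one in its Block, with the AllocateStmt at the deepest level.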
-#define CHECK_SIMPLE_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - } +void OmpStructureChecker::Enter(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); +} -#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } +void OmpStructureChecker::Leave(const parser::SpecificationPart &) { + partStack_.pop_back(); +} -#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } +void OmpStructureChecker::Enter(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); +} -// Use when clause don't falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_PARSER_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::X &) { \ - CheckAllowedClause(llvm::omp::Y); \ - } +void OmpStructureChecker::Leave(const parser::ExecutionPart &) { + partStack_.pop_back(); +} // 'OmpWorkshareBlockChecker' is used to check the validity of the assignment // statements and the expressions enclosed in an OpenMP Workshare construct @@ -720,18 +712,10 @@ template <typename Checker> struct DirectiveSpellingVisitor { return std::get<parser::OmpBeginDirective>(t).DirName(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPDispatchConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_dispatch); return false; } - bool Pre(const parser::OpenMPExecutableAllocate &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPAllocatorsConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_allocators); return false; @@ -1667,11 +1651,6 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) { dirContext_.pop_back(); } -static std::pair<const parser::AllocateStmt *, parser::CharBlock> -getAllocateStmtAndSource(const parser::Statement<parser::AllocateStmt> &stmt) { - return {&stmt.statement, stmt.source}; -} - static std::pair<const parser::AllocateStmt *, parser::CharBlock> getAllocateStmtAndSource(const parser::ExecutionPartConstruct *epc) { if (SourcedActionStmt as{GetActionStmt(epc)}) { @@ -1699,19 +1678,12 @@ static UnorderedSymbolSet GetNonComponentSymbols( return symbols; } -static const parser::OmpObjectList &GetObjectsOrEmpty( - const std::optional<parser::OmpObjectList> &maybeObjects) { - static parser::OmpObjectList empty{std::list<parser::OmpObject>{}}; - if (maybeObjects) { - return *maybeObjects; - } - return empty; -} +void OmpStructureChecker::CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; -void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses) { - const Scope &thisScope{context_.FindScope(source)}; + const Scope &thisScope{context_.FindScope(dirName.source)}; auto 
maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) { // Return "true" if the ALLOCATOR clause was provided with an argument @@ -1740,7 +1712,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, const auto *allocator{[&]() { // Can't use FindClause in Enter (because clauses haven't been visited // yet). - for (const parser::OmpClause &c : clauses.v) { + for (const parser::OmpClause &c : beginSpec.Clauses().v) { if (c.Id() == llvm::omp::Clause::OMPC_allocator) { return &c; } @@ -1752,7 +1724,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, bool hasDynAllocators{ HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; if (!allocator && !hasDynAllocators) { - context_.Say(source, + context_.Say(dirName.source, "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); } } @@ -1766,7 +1738,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, : "a named common block or has SAVE attribute"}; auto checkSymbol{[&](const Symbol &symbol, parser::CharBlock source) { - if (!inExecutableAllocate_) { + if (!isExecutable) { // For structure members, the scope is the derived type, which is // never "this" scope. Ignore this check for members, they will be // flagged anyway. @@ -1802,37 +1774,130 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, } }}; - for (const parser::OmpObject &object : objects.v) { - parser::CharBlock objSource{[&]() { - if (auto &&maybeSource{GetObjectSource(object)}) { - return *maybeSource; - } - return source; - }()}; - if (const Symbol *symbol{GetObjectSymbol(object)}) { + for (const parser::OmpArgument &arg : beginSpec.Arguments().v) { + const parser::OmpObject *object{GetArgumentObject(arg)}; + if (!object) { + context_.Say(arg.source, + "An argument to ALLOCATE directive must be a variable list item"_err_en_US); + continue; + } + + if (const Symbol *symbol{GetObjectSymbol(*object)}) { if (!IsTypeParamInquiry(*symbol)) { - checkSymbol(*symbol, objSource); + checkSymbol(*symbol, arg.source); } - CheckVarIsNotPartOfAnotherVar(source, object); + CheckVarIsNotPartOfAnotherVar(dirName.source, *object); } } } -void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); +void OmpStructureChecker::CheckExecutableAllocateDirective( + const parser::OmpAllocateDirective &x) { + parser::omp::OmpAllocateInfo info{SplitOmpAllocate(x)}; + + auto [allocStmt, allocSource]{getAllocateStmtAndSource(info.body)}; + if (!allocStmt) { + // This has been diagnosed already. 
+ return; + } - if (!inExecutableAllocate_) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &clauses{std::get<parser::OmpClauseList>(x.t)}; - const auto &objects{ - GetObjectsOrEmpty(std::get<std::optional<parser::OmpObjectList>>(x.t))}; + UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; + SymbolSourceMap directiveSyms; + bool hasEmptyList{false}; + + for (const parser::OmpAllocateDirective *ompAlloc : info.dirs) { + const parser::OmpDirectiveSpecification &spec{DEREF(ompAlloc).BeginDir()}; + if (spec.Arguments().v.empty()) { + if (hasEmptyList && info.dirs.size() > 1) { + context_.Say(spec.DirName().source, + "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); + } + hasEmptyList = true; + } + for (const parser::OmpArgument &arg : spec.Arguments().v) { + if (auto *sym{GetArgumentSymbol(arg)}) { + // Ignore these checks for structure members. They are not allowed + // in the first place, so don't tell the users that they need to + // be specified somewhere, + if (IsStructureComponent(*sym)) { + continue; + } + if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { + parser::MessageFormattedText txt( + "A list item on an executable ALLOCATE may only be specified once"_err_en_US); + parser::Message message(arg.source, txt); + message.Attach(f->second, "The list item was specified here"_en_US); + context_.Say(std::move(message)); + } else { + directiveSyms.insert(std::make_pair(sym, arg.source)); + } - CheckAllocateDirective(dir.source, objects, clauses); + if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { + context_ + .Say(arg.source, + "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) + .Attach(allocSource, "The ALLOCATE statement"_en_US); + } + } + } } } -void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) { +void OmpStructureChecker::Enter(const parser::OmpAllocateDirective &x) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + PushContextAndClauseSets(dirName.source, dirName.v); + ++allocateDirectiveLevel; + + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; + + unsigned version{context_.langOptions().OpenMPVersion}; + if (isExecutable && allocateDirectiveLevel == 1 && version >= 52) { + context_.Warn(common::UsageWarning::OpenMPUsage, dirName.source, + "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); + } + + CheckIndividualAllocateDirective(x, isExecutable); + + if (isExecutable) { + auto isOmpAllocate{[](const parser::ExecutionPartConstruct &epc) { + if (auto *omp{GetOmp(epc)}) { + auto odn{GetOmpDirectiveName(*omp)}; + return odn.v == llvm::omp::Directive::OMPD_allocate; + } + return false; + }}; + + auto &body{std::get<parser::Block>(x.t)}; + // The parser should put at most one statement in the body. 
+ assert(body.size() <= 1 && "Multiple statements in allocate"); + if (body.empty()) { + context_.Say(dirName.source, + "An executable ALLOCATE directive must be associated with an ALLOCATE statement"_err_en_US); + } else { + const parser::ExecutionPartConstruct &first{body.front()}; + auto [allocStmt, _]{getAllocateStmtAndSource(&body.front())}; + if (!isOmpAllocate(first) && !allocStmt) { + parser::CharBlock source{[&]() { + if (auto &&maybeSource{parser::GetSource(first)}) { + return *maybeSource; + } + return dirName.source; + }()}; + context_.Say(source, + "The statement associated with executable ALLOCATE directive must be an ALLOCATE statement"_err_en_US); + } + } + } +} + +void OmpStructureChecker::Leave(const parser::OmpAllocateDirective &x) { + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; + if (isExecutable && allocateDirectiveLevel == 1) { + CheckExecutableAllocateDirective(x); + } + + --allocateDirectiveLevel; dirContext_.pop_back(); } @@ -2135,112 +2200,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { } } -void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { - inExecutableAllocate_ = true; - const auto &dir{std::get<parser::Verbatim>(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); - - unsigned version{context_.langOptions().OpenMPVersion}; - if (version >= 52) { - context_.Warn(common::UsageWarning::OpenMPUsage, x.source, - "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); - } - - auto &objects{ - GetObjectsOrEmpty(std::get<std::optional<parser::OmpObjectList>>(x.t))}; - auto &clauses{std::get<parser::OmpClauseList>(x.t)}; - - CheckAllocateDirective( - std::get<parser::Verbatim>(x.t).source, objects, clauses); - - if (const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &clauses{std::get<parser::OmpClauseList>(dalloc.t)}; - const auto &objects{GetObjectsOrEmpty( - std::get<std::optional<parser::OmpObjectList>>(dalloc.t))}; - CheckAllocateDirective(dir.source, objects, clauses); - } - } -} - -void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) { - auto [allocStmt, allocSource]{getAllocateStmtAndSource( - std::get<parser::Statement<parser::AllocateStmt>>(x.t))}; - - UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; - SymbolSourceMap directiveSyms; - auto &objects{ - GetObjectsOrEmpty(std::get<std::optional<parser::OmpObjectList>>(x.t))}; - auto emptyListCount{static_cast<size_t>(objects.v.empty())}; - auto checkObjects{[&](const parser::OmpObjectList &objects, - parser::CharBlock dirSource, - parser::CharBlock allocSource) { - for (const parser::OmpObject &object : objects.v) { - parser::CharBlock objSource{[&]() { - if (auto &&maybeSource{GetObjectSource(object)}) { - return *maybeSource; - } - return dirSource; - }()}; - if (auto *sym{GetObjectSymbol(object)}) { - // Ignore these checks for structure members. 
They are not allowed - // in the first place, so don't tell the users that they nened to - // be specified somewhere, - if (IsStructureComponent(*sym)) { - continue; - } - if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { - parser::MessageFormattedText txt( - "A list item on an executable ALLOCATE may only be specified once"_err_en_US); - parser::Message message(objSource, txt); - message.Attach(f->second, "The list item was specified here"_en_US); - context_.Say(std::move(message)); - } else { - directiveSyms.insert(std::make_pair(sym, objSource)); - } - - if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { - context_ - .Say(objSource, - "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) - .Attach(allocSource, "The ALLOCATE statement"_en_US); - } - } - } - }}; - - checkObjects(objects, std::get<parser::Verbatim>(x.t).source, allocSource); - - const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}; - if (!subDirs) { - inExecutableAllocate_ = false; - dirContext_.pop_back(); - return; - } - - for (const parser::OpenMPDeclarativeAllocate &ompAlloc : *subDirs) { - parser::CharBlock dirSource{std::get<parser::Verbatim>(ompAlloc.t).source}; - auto &objects{GetObjectsOrEmpty( - std::get<std::optional<parser::OmpObjectList>>(ompAlloc.t))}; - if (objects.v.empty()) { - // Only show the message once per construct. - if (++emptyListCount == 2 && subDirs->size() >= 1) { - context_.Say(dirSource, - "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); - } - } - checkObjects(objects, dirSource, allocSource); - } - - inExecutableAllocate_ = false; - dirContext_.pop_back(); -} - void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; @@ -3408,88 +3367,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Sizes &c) { /*paramName=*/"parameter", /*allowZero=*/false); } -// Following clauses do not have a separate node in parse-tree.h. 
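The list-item rules enforced by CheckExecutableAllocateDirective above can be summarized with a small standalone sketch; the container choices and item names here are illustrative only, not the checker's real symbol machinery.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Items named on the associated ALLOCATE statement.
  std::set<std::string> allocateStmtItems{"x", "y"};
  // Items named on each directive of the executable ALLOCATE construct.
  std::vector<std::vector<std::string>> directiveItems{{"x"}, {"y", "x"}};

  std::map<std::string, int> firstSeen; // item -> index of first directive
  for (int d = 0; d < static_cast<int>(directiveItems.size()); ++d) {
    for (const std::string &item : directiveItems[d]) {
      auto [it, inserted] = firstSeen.emplace(item, d);
      if (!inserted) // rule 1: at most one directive may list an item
        std::cout << "error: '" << item << "' already listed on directive "
                  << it->second << "\n";
      if (!allocateStmtItems.count(item)) // rule 2: must be allocated
        std::cout << "error: '" << item
                  << "' is not allocated by the ALLOCATE statement\n";
    }
  }
  // Prints one duplicate-item error for 'x'.
}

The checker additionally enforces that at most one directive in the stack may have an empty list, which the sketch leaves out for brevity.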
-CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) -CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) -CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) -CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) -CHECK_SIMPLE_CLAUSE(Default, OMPC_default) -CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) -CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) -CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) -CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) -CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) -CHECK_SIMPLE_CLAUSE(Final, OMPC_final) -CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) -CHECK_SIMPLE_CLAUSE(Full, OMPC_full) -CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) -CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) -CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) -CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) -CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) -CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) -CHECK_SIMPLE_CLAUSE(Match, OMPC_match) -CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) -CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) -CHECK_SIMPLE_CLAUSE(Order, OMPC_order) -CHECK_SIMPLE_CLAUSE(Read, OMPC_read) -CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) -CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) -CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) -CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) -CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) -CHECK_SIMPLE_CLAUSE(Link, OMPC_link) -CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) -CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) -CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) -CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) -CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) -CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) -CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) -CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) -CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) -CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) -CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) -CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) -CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) -CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) -CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) -CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) -CHECK_SIMPLE_CLAUSE(Write, OMPC_write) -CHECK_SIMPLE_CLAUSE(Init, OMPC_init) -CHECK_SIMPLE_CLAUSE(Use, OMPC_use) -CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) -CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) -CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) -CHECK_SIMPLE_CLAUSE(Message, OMPC_message) -CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) -CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) -CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) -CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) -CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) -CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) -CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) -CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) -CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) -CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) -CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) -CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) -CHECK_SIMPLE_CLAUSE(Release, OMPC_release) -CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) -CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) -CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) -CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) - -CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) -CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) -CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) -CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) -CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, 
OMPC_thread_limit) - -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) - void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) { context_.Say(GetContext().clauseSource, "LOOPRANGE clause is not implemented yet"_err_en_US, @@ -5562,4 +5439,104 @@ void OmpStructureChecker::CheckAllowedRequiresClause(llvmOmpClause clause) { } } +// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. +#define CHECK_SIMPLE_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + } + +#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +// Following clauses do not have a separate node in parse-tree.h. +CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) +CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) +CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) +CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) +CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) +CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) +CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) +CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) +CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) +CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) +CHECK_SIMPLE_CLAUSE(Default, OMPC_default) +CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) +CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) +CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) +CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) +CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) +CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) +CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) +CHECK_SIMPLE_CLAUSE(Final, OMPC_final) +CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) +CHECK_SIMPLE_CLAUSE(Full, OMPC_full) +CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) +CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) +CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) +CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) +CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) +CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) +CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) +CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) +CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) +CHECK_SIMPLE_CLAUSE(Init, OMPC_init) +CHECK_SIMPLE_CLAUSE(Link, OMPC_link) +CHECK_SIMPLE_CLAUSE(Match, OMPC_match) +CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) +CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) +CHECK_SIMPLE_CLAUSE(Message, OMPC_message) +CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) +CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) +CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) +CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) +CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) +CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) +CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) +CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) +CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) +CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) +CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) +CHECK_SIMPLE_CLAUSE(Order, OMPC_order) +CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) 
+CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) +CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) +CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) +CHECK_SIMPLE_CLAUSE(Read, OMPC_read) +CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) +CHECK_SIMPLE_CLAUSE(Release, OMPC_release) +CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) +CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) +CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) +CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) +CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) +CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) +CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) +CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) +CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) +CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) +CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) +CHECK_SIMPLE_CLAUSE(Use, OMPC_use) +CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) +CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) +CHECK_SIMPLE_CLAUSE(Write, OMPC_write) + +CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) +CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) +CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) +CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) +CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, OMPC_thread_limit) + +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) + } // namespace Fortran::semantics diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 6feb1d149c4fd..1b84bc5dda471 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -82,6 +82,11 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool Enter(const parser::BlockConstruct &); void Leave(const parser::BlockConstruct &); + void Enter(const parser::SpecificationPart &); + void Leave(const parser::SpecificationPart &); + void Enter(const parser::ExecutionPart &); + void Leave(const parser::ExecutionPart &); + void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); void Enter(const parser::OpenMPInteropConstruct &); @@ -113,8 +118,8 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpDeclareVariantDirective &); void Enter(const parser::OpenMPDeclareSimdConstruct &); void Leave(const parser::OpenMPDeclareSimdConstruct &); - void Enter(const parser::OpenMPDeclarativeAllocate &); - void Leave(const parser::OpenMPDeclarativeAllocate &); + void Enter(const parser::OmpAllocateDirective &); + void Leave(const parser::OmpAllocateDirective &); void Enter(const parser::OpenMPDeclareMapperConstruct &); void Leave(const parser::OpenMPDeclareMapperConstruct &); void Enter(const parser::OpenMPDeclareReductionConstruct &); @@ -129,8 +134,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpErrorDirective &); void Enter(const parser::OmpNothingDirective &); void Leave(const parser::OmpNothingDirective &); - void Enter(const parser::OpenMPExecutableAllocate &); - void Leave(const parser::OpenMPExecutableAllocate &); void Enter(const parser::OpenMPAllocatorsConstruct &); void Leave(const parser::OpenMPAllocatorsConstruct &); void Enter(const parser::OpenMPRequiresConstruct &); @@ -263,9 +266,9 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool CheckTargetBlockOnlyTeams(const parser::Block &); void CheckWorkshareBlockStmts(const parser::Block &, 
parser::CharBlock); void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock); - void CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses); + void CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable); + void CheckExecutableAllocateDirective(const parser::OmpAllocateDirective &x); void CheckIteratorRange(const parser::OmpIteratorSpecifier &x); void CheckIteratorModifier(const parser::OmpIterator &x); @@ -373,7 +376,7 @@ class OmpStructureChecker : public OmpStructureCheckerBase { }; int directiveNest_[LastType + 1] = {0}; - bool inExecutableAllocate_{false}; + int allocateDirectiveLevel{0}; parser::CharBlock visitedAtomicSource_; SymbolSourceMap deferredNonVariables_; @@ -382,6 +385,14 @@ class OmpStructureChecker : public OmpStructureCheckerBase { std::vector<LoopConstruct> loopStack_; // Scopes for scoping units. std::vector<const Scope *> scopeStack_; + + enum class PartKind : int { + // There are also other "parts", such as internal-subprogram-part, etc, + // but we're keeping track of these two for now. + SpecificationPart, + ExecutionPart, + }; + std::vector<PartKind> partStack_; }; /// Find a duplicate entry in the range, and return an iterator to it. diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 03c8cb0065fd8..deb57e005a352 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -415,6 +415,18 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { return true; } + bool Pre(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); + return true; + } + void Post(const parser::SpecificationPart &) { partStack_.pop_back(); } + + bool Pre(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); + return true; + } + void Post(const parser::ExecutionPart &) { partStack_.pop_back(); } + bool Pre(const parser::InternalSubprogram &) { // Clear the labels being tracked in the previous scope ClearLabels(); @@ -639,8 +651,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { bool Pre(const parser::OpenMPThreadprivate &); void Post(const parser::OpenMPThreadprivate &) { PopContext(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &); - void Post(const parser::OpenMPDeclarativeAllocate &) { PopContext(); } + bool Pre(const parser::OmpAllocateDirective &); bool Pre(const parser::OpenMPAssumeConstruct &); void Post(const parser::OpenMPAssumeConstruct &) { PopContext(); } @@ -651,9 +662,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { bool Pre(const parser::OpenMPDispatchConstruct &); void Post(const parser::OpenMPDispatchConstruct &) { PopContext(); } - bool Pre(const parser::OpenMPExecutableAllocate &); - void Post(const parser::OpenMPExecutableAllocate &); - bool Pre(const parser::OpenMPAllocatorsConstruct &); void Post(const parser::OpenMPAllocatorsConstruct &); @@ -998,6 +1006,14 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { targetLabels_; parser::CharBlock currentStatementSource_; + enum class PartKind : int { + // There are also other "parts", such as internal-subprogram-part, etc, + // but we're keeping track of these two for now. 
+ SpecificationPart, + ExecutionPart, + }; + std::vector<PartKind> partStack_; + void AddAllocateName(const parser::Name *&object) { allocateNames_.push_back(object); } @@ -2558,11 +2574,24 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { return true; } -bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) { +bool OmpAttributeVisitor::Pre(const parser::OmpAllocateDirective &x) { PushContext(x.source, llvm::omp::Directive::OMPD_allocate); - if (const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}) { - ResolveOmpObjectList(*list, Symbol::Flag::OmpDeclarativeAllocateDirective); + assert(!partStack_.empty() && "Misplaced directive"); + + auto ompFlag{partStack_.back() == PartKind::SpecificationPart + ? Symbol::Flag::OmpDeclarativeAllocateDirective + : Symbol::Flag::OmpExecutableAllocateDirective}; + + parser::omp::OmpAllocateInfo info{parser::omp::SplitOmpAllocate(x)}; + for (const parser::OmpAllocateDirective *ad : info.dirs) { + for (const parser::OmpArgument &arg : ad->BeginDir().Arguments().v) { + if (auto *object{omp::GetArgumentObject(arg)}) { + ResolveOmpObject(*object, ompFlag); + } + } } + + PopContext(); return false; } @@ -2581,15 +2610,6 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) { return true; } -bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) { - PushContext(x.source, llvm::omp::Directive::OMPD_allocate); - const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}; - if (list) { - ResolveOmpObjectList(*list, Symbol::Flag::OmpExecutableAllocateDirective); - } - return true; -} - bool OmpAttributeVisitor::Pre(const parser::OpenMPAllocatorsConstruct &x) { const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; PushContext(x.source, dirSpec.DirId()); @@ -2661,10 +2681,6 @@ bool OmpAttributeVisitor::IsNestedInDirective(llvm::omp::Directive directive) { return false; } -void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) { - PopContext(); -} - void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) { PopContext(); } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 220f1c96b9823..a2062ef28d52c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1700,12 +1700,12 @@ class OmpVisitor : public virtual DeclarationVisitor { void Post(const parser::OpenMPDeclareTargetConstruct &) { SkipImplicitTyping(false); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { + bool Pre(const parser::OmpAllocateDirective &x) { AddOmpSourceRange(x.source); SkipImplicitTyping(true); return true; } - void Post(const parser::OpenMPDeclarativeAllocate &) { + void Post(const parser::OmpAllocateDirective &) { SkipImplicitTyping(false); messageHandler().set_currStmtSource(std::nullopt); } diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 59af58ddcd32e..27097193aaa9b 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1171,6 +1171,45 @@ attributes(device) pure integer(8) function atomicaddl(address, val) integer(8), intent(inout) :: address integer(8), value :: val end function + attributes(device) pure integer(4) function atomicaddr2(address, val) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + end function + end interface + + interface atomicaddvector + attributes(device) pure function 
atomicaddvector_r2x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + real(2), dimension(2) :: z + end function + + attributes(device) pure function atomicaddvector_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x2 + attributes(device) pure function atomicadd_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x4 + attributes(device) pure function atomicadd_r4x4(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(4), intent(inout) :: address + real(4), dimension(4), intent(in) :: val + real(4), dimension(4) :: z + end function end interface interface atomicsub diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/gcc-triple.f90 b/flang/test/Driver/gcc-triple.f90 new file mode 100644 index 0000000000000..324311febf157 --- /dev/null +++ b/flang/test/Driver/gcc-triple.f90 @@ -0,0 +1,18 @@ +!! UNSUPPORTED: system-windows + +!! Test that --gcc-triple option is working as expected. + +! RUN: %flang --target=x86_64-linux-gnu -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! 
DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 +! DEFAULT_TRIPLE: {{^}}Selected GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-redhat-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_EXISTS +! TRIPLE_EXISTS: {{^}}Selected GCC installation: +! TRIPLE_EXISTS: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-foo-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_DOES_NOT_EXISTS +! TRIPLE_DOES_NOT_EXISTS-NOT: x86_64-foo-linux \ No newline at end of file diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf new file mode 100644 index 0000000000000..6669b4afa291d --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -0,0 +1,35 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran atmoicadd functions available cudadevice module + +attributes(global) subroutine test_atomicaddvector_r2() + real(2), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> + +attributes(global) subroutine test_atomicaddvector_r4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r2x4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicaddreal4x2(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r2x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r4x4() + real(4), device :: a(4), tmp1(4), tmp2(4) + tmp1 = atomicaddreal4x4(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r4x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<4xf32> diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 09b4302446ee7..2d2c801b48f4d 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -14,6 +14,8 @@ attributes(global) subroutine devsub() integer :: smalltime integer(4) :: res, offset integer(8) :: resl + real(2) :: r2a(2) + real(2) :: tmp2(2) integer :: tid tid = threadIdx%x @@ -34,6 +36,7 @@ attributes(global) subroutine devsub() al = atomicadd(al, 1_8) af = atomicadd(af, 1.0_4) ad = atomicadd(ad, 1.0_8) + ai = atomicadd(r2a, tmp2) ai = atomicsub(ai, 1_4) al = atomicsub(al, 1_8) @@ -128,6 +131,7 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> ! 
CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 @@ -436,11 +440,11 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 +! CHECK: %{{.*}} = nvvm.mbarrier.arrive %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64 attributes(global) subroutine test_fence() @@ -490,7 +494,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) @@ -515,7 +519,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() ! CHECK: scf.while -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_barrier_try_wait_sleep() integer :: istat @@ -526,7 +530,7 @@ attributes(global) subroutine test_barrier_try_wait_sleep() end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_tma_bulk_load_c4(a, n) integer(8), shared :: barrier1 @@ -671,7 +675,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_c8(c, n) @@ -684,7 +688,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i4(c, n) @@ -697,7 +701,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 ! 
CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i8(c, n) @@ -710,7 +714,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 @@ -724,7 +728,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r4(c, n) @@ -737,7 +741,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r8(c, n) @@ -750,5 +754,5 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index 8daf20e1ae400..fec146ac70313 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -5,6 +5,6 @@ program main integer :: x - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x) align(32) end diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 index e83b433d0fda0..3307eb2505b71 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 @@ -5,6 +5,6 @@ program main integer :: x, y - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! 
CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x, y) end diff --git a/flang/test/Parser/OpenMP/allocate-align-tree.f90 b/flang/test/Parser/OpenMP/allocate-align-tree.f90 index 0d247cd1ed945..d799aa10a82ff 100644 --- a/flang/test/Parser/OpenMP/allocate-align-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-align-tree.f90 @@ -16,27 +16,33 @@ program allocate_align_tree allocate(j(z), xarray(t)) end program allocate_align_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'j' +!CHECK: DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!CHECK-NEXT: | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!CHECK-NEXT: | AttrSpec -> Allocatable +!CHECK-NEXT: | EntityDecl +!CHECK-NEXT: | | Name = 'j' +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'j' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' +!CHECK-NEXT: | | | LiteralConstant -> IntLiteralConstant = '16' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' +!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '32' +!CHECK-NEXT: | | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' -!CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '32' -!CHECK-NEXT: | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' -!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '16' -!CHECK-NEXT: | | | AllocateStmt +!UNPARSE: !$OMP ALLOCATE(j) ALIGN(16_4) +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALIGN(32_4) ALLOCATOR(2_8) +!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) -!UNPARSE: !$OMP ALLOCATE (j) ALIGN(16_4) 
-!UNPARSE: !$OMP ALLOCATE (xarray) ALIGN(32_4) ALLOCATOR(2_8) -!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) diff --git a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 index afcaf44b09f03..800e4a57d5f0e 100644 --- a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 @@ -17,33 +17,48 @@ program allocate_tree allocate (w, xarray(4), zarray(5, f)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'f' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = +!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'f' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block !CHECK-NEXT: | ExecutionPart -> Block !CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'f=2_4' !CHECK-NEXT: | | | Variable = 'f' !CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'f' !CHECK-NEXT: | | | Expr = '2_4' !CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '2' -!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | 
ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | Block +!CHECK-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | | | Block +!CHECK-NEXT: | | | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt diff --git a/flang/test/Parser/OpenMP/allocate-tree.f90 b/flang/test/Parser/OpenMP/allocate-tree.f90 index bf413d591baf2..021d8104a7e62 100644 --- a/flang/test/Parser/OpenMP/allocate-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree.f90 @@ -7,52 +7,54 @@ program allocate_tree use omp_lib - integer, allocatable :: w, xarray(:), zarray(:, :) - integer :: z, t + integer, allocatable :: xarray(:), zarray(:, :) + integer :: z, t, w +!$omp allocate(w) allocator(omp_const_mem_alloc) t = 2 z = 3 -!$omp allocate(w) allocator(omp_const_mem_alloc) !$omp allocate(xarray) allocator(omp_large_cap_mem_alloc) !$omp allocate(zarray) allocator(omp_default_mem_alloc) !$omp allocate - allocate(w, xarray(4), zarray(t, z)) + allocate(xarray(4), zarray(t, z)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'w' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'xarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '1' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'zarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '2' - +!CHECK: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 
'omp_const_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!UNPARSE: !$OMP ALLOCATE (w) ALLOCATOR(3_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (xarray) ALLOCATOR(2_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (zarray) ALLOCATOR(1_8) +!UNPARSE: !$OMP ALLOCATE(w) ALLOCATOR(3_8) +!UNPARSE-NEXT: t=2_4 +!UNPARSE-NEXT: z=3_4 +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALLOCATOR(2_8) +!UNPARSE-NEXT: !$OMP ALLOCATE(zarray) ALLOCATOR(1_8) !UNPARSE-NEXT: !$OMP ALLOCATE -!UNPARSE-NEXT: ALLOCATE(w, xarray(4_4), zarray(t,z)) +!UNPARSE-NEXT: ALLOCATE(xarray(4_4), zarray(t,z)) diff --git a/flang/test/Parser/OpenMP/allocate-unparse.f90 b/flang/test/Parser/OpenMP/allocate-unparse.f90 index 63cc35cd55082..b61a97150cad2 100644 --- a/flang/test/Parser/OpenMP/allocate-unparse.f90 +++ b/flang/test/Parser/OpenMP/allocate-unparse.f90 @@ 
-34,19 +34,19 @@ program allocate_unparse end program allocate_unparse !CHECK:!$OMP ALLOCATE{{[ ]*$}} -!CHECK:!$OMP ALLOCATE (x,y) -!CHECK:!$OMP ALLOCATE (x,y) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (a,b) +!CHECK:!$OMP ALLOCATE(x, y) +!CHECK:!$OMP ALLOCATE(x, y) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) !CHECK:ALLOCATE(darray(a,b)) !CHECK:!$OMP ALLOCATE ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (a,b) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (t) ALLOCATOR(omp_const_mem_alloc) -!CHECK:!$OMP ALLOCATE (z) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (m) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (n) -!CHECK:!$OMP ALLOCATE (j) ALIGN(16) +!CHECK:!$OMP ALLOCATE(t) ALLOCATOR(omp_const_mem_alloc) +!CHECK:!$OMP ALLOCATE(z) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(m) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(n) +!CHECK:!$OMP ALLOCATE(j) ALIGN(16) !CHECK:ALLOCATE(darray(z,t)) !CHECK:!$OMP ALLOCATE{{[ ]*$}} !CHECK:ALLOCATE(darray(a,b)) diff --git a/flang/test/Parser/OpenMP/nested-directive.f90 b/flang/test/Parser/OpenMP/nested-directive.f90 new file mode 100644 index 0000000000000..2a10bbe666bb8 --- /dev/null +++ b/flang/test/Parser/OpenMP/nested-directive.f90 @@ -0,0 +1,7 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s 2>&1 | FileCheck %s --match-full-lines + +subroutine func + implicit none +! CHECK: !$OMP NOTHING + !$omp nothing !$omp Cannot nest directives inside directives; must be interpreted as a comment +end subroutine func diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90 index 88bcd6d2f1008..4a1e60cf73fff 100644 --- a/flang/test/Semantics/OpenMP/allocate-align01.f90 +++ b/flang/test/Semantics/OpenMP/allocate-align01.f90 @@ -11,9 +11,9 @@ program allocate_align_tree integer :: z, t, xx t = 2 z = 3 + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: Must be a constant value !$omp allocate(j) align(xx) - !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: The alignment should be positive !$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc) allocate(j(z), xarray(t)) diff --git a/flang/test/Semantics/OpenMP/allocate-directive.f90 b/flang/test/Semantics/OpenMP/allocate-directive.f90 index 18a14b825f00d..e34125b392bda 100644 --- a/flang/test/Semantics/OpenMP/allocate-directive.f90 +++ b/flang/test/Semantics/OpenMP/allocate-directive.f90 @@ -11,7 +11,7 @@ integer, allocatable :: a, b, m, n, t, z !$omp allocate(x, y) !$omp allocate(x, y) allocator(omp_default_mem_alloc) - + continue !$omp allocate(a, b) allocate ( a, b ) diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 229fd4d6c3f95..5fe4efdd106d9 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -17,7 +17,7 @@ subroutine sema() !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(y) - print *, a + print *, a !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !$omp allocate(x) 
allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90 index 8f0579e810bb9..a1e684796edb2 100644 --- a/flang/test/Semantics/OpenMP/allocate02.f90 +++ b/flang/test/Semantics/OpenMP/allocate02.f90 @@ -16,6 +16,7 @@ subroutine allocate() !ERROR: At most one ALLOCATOR clause can appear on the ALLOCATE directive !$omp allocate(x, y) allocator(omp_default_mem_alloc) allocator(omp_default_mem_alloc) + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate ( darray(a, b) ) diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90 index e35115f3897cc..3609f38eb6ee7 100644 --- a/flang/test/Semantics/OpenMP/allocate03.f90 +++ b/flang/test/Semantics/OpenMP/allocate03.f90 @@ -17,6 +17,7 @@ subroutine allocate() !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(my_var%array) + continue !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(darray, my_var%array) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90 index 9b57322bbadc6..272094aaaeec2 100644 --- a/flang/test/Semantics/OpenMP/allocate06.f90 +++ b/flang/test/Semantics/OpenMP/allocate06.f90 @@ -13,7 +13,7 @@ subroutine allocate() !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute !$omp allocate(darray) allocator(omp_default_mem_alloc) - + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate(darray(a, b)) diff --git a/flang/test/Semantics/OpenMP/allocate10.f90 b/flang/test/Semantics/OpenMP/allocate10.f90 index a9db7330296ba..0a9e85b8ae2fe 100644 --- a/flang/test/Semantics/OpenMP/allocate10.f90 +++ b/flang/test/Semantics/OpenMP/allocate10.f90 @@ -4,8 +4,8 @@ subroutine f00 integer, allocatable :: x, y continue - !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items !$omp allocate + !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items !$omp allocate allocate(x, y) end diff --git a/flang/test/Semantics/OpenMP/allocate12.f90 b/flang/test/Semantics/OpenMP/allocate12.f90 new file mode 100644 index 0000000000000..2b3b510fbf40c --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate12.f90 @@ -0,0 +1,16 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 + +subroutine f00 + integer, allocatable :: x + continue + !ERROR: An executable ALLOCATE directive must be associated with an ALLOCATE statement + !$omp allocate(x) +end + +subroutine f01 + integer, allocatable :: x + continue + !$omp allocate(x) + !ERROR: The statement associated with executable ALLOCATE directive must be an ALLOCATE statement + continue +end diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index ae555a256ba66..4e6b4195a9c5e 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -47,8 +47,6 @@ set(LIBC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(LIBC_ENABLE_USE_BY_CLANG OFF CACHE BOOL "Whether or not to place libc in a build directory findable by a just built clang") -set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel headers") - # Defining a global namespace to enclose all libc functions. 
set(default_namespace "__llvm_libc") if(LLVM_VERSION_MAJOR) @@ -146,6 +144,11 @@ option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF) +if(LIBC_TARGET_OS_IS_LINUX) + set(kernel_headers "/usr/include") +endif() +set(LIBC_KERNEL_HEADERS "${kernel_headers}" CACHE STRING "Path to Linux kernel headers") + set(LIBC_ENABLE_UNITTESTS ON) set(LIBC_ENABLE_HERMETIC_TESTS ${LLVM_LIBC_FULL_BUILD}) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 714120a79e39a..e0dd15b803253 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -325,8 +325,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve - # Disabled while SYS_faccessat2 is unavailable on the buildbot. - # libc.src.unistd.faccessat + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/aarch64/exclude.txt b/libc/config/linux/aarch64/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/aarch64/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index f6bbb346d10e5..0d031d8844f13 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -329,6 +329,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/riscv/exclude.txt b/libc/config/linux/riscv/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/riscv/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 7a8d74a4e5da9..a44e2041e57f2 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -326,6 +326,7 @@ set(TARGET_LIBC_ENTRYPOINTS # unistd.h entrypoints libc.src.unistd.access libc.src.unistd.chdir + libc.src.unistd.chown libc.src.unistd.close libc.src.unistd.dup libc.src.unistd.dup2 @@ -344,6 +345,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.getppid libc.src.unistd.getsid libc.src.unistd.gettid + libc.src.unistd.getgid libc.src.unistd.getuid libc.src.unistd.isatty libc.src.unistd.link diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt index a0686310d21ac..31b60a9c3497c 100644 --- a/libc/config/linux/x86_64/exclude.txt +++ b/libc/config/linux/x86_64/exclude.txt @@ -23,6 +23,7 @@ endif() include(CheckSymbolExists) 
check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS libc.src.unistd.faccessat ) diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 225843924c243..433c47b174766 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -479,3 +479,11 @@ add_proxy_header_library( libc.include.llvm-libc-types.struct_rlimit libc.include.sys_resource ) + +add_proxy_header_library( + gid_t + HDRS + gid_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.gid_t +) diff --git a/libc/hdr/types/gid_t.h b/libc/hdr/types/gid_t.h new file mode 100644 index 0000000000000..bc274aaa9a8a8 --- /dev/null +++ b/libc/hdr/types/gid_t.h @@ -0,0 +1,22 @@ +//===-- Proxy for gid_t ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_GID_T_H +#define LLVM_LIBC_HDR_TYPES_GID_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/gid_t.h" + +#else // Overlay mode + +#include <sys/types.h> + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_GID_T_H diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 2ff86eafaf550..0e5b22e627b67 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -3,6 +3,7 @@ header_template: unistd.h.def macros: [] types: - type_name: uid_t + - type_name: gid_t - type_name: ssize_t - type_name: size_t - type_name: pid_t @@ -54,6 +55,14 @@ functions: return_type: int arguments: - type: const char * + - name: chown + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: uid_t + - type: gid_t - name: close standards: - POSIX @@ -195,6 +204,12 @@ functions: return_type: uid_t arguments: - type: void + - name: getgid + standards: + - POSIX + return_type: gid_t + arguments: + - type: void - name: isatty standards: - POSIX diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index cab146a5b8698..9115ed2856881 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -700,7 +700,11 @@ template <> class FloatToString<long double> { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed <<= SHIFT_AMOUNT; + if (SHIFT_AMOUNT > 0) { + float_as_fixed <<= SHIFT_AMOUNT; + } else { + float_as_fixed >>= -SHIFT_AMOUNT; + } // If there are still digits above the decimal point, handle those. if (cpp::countl_zero(float_as_fixed) < @@ -769,7 +773,7 @@ template <> class FloatToString<long double> { // The decimal representation of 2**(-i) will have exactly i digits after // the decimal point. 
const int num_requested_digits = - static_cast<int>((negative_block_index + 1) * BLOCK_SIZE); + static_cast<int>(negative_block_index * BLOCK_SIZE); return num_requested_digits > -exponent; } diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 78c3bf8442fab..337480cbbf928 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -27,6 +27,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.chdir ) +add_entrypoint_object( + chown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.chown +) + add_entrypoint_object( close ALIAS @@ -160,6 +167,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.getuid ) +add_entrypoint_object( + getgid + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getgid +) + add_entrypoint_object( isatty ALIAS diff --git a/libc/src/unistd/chown.h b/libc/src/unistd/chown.h new file mode 100644 index 0000000000000..84a8eba2cb2e6 --- /dev/null +++ b/libc/src/unistd/chown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for chown -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_CHOWN_H +#define LLVM_LIBC_SRC_UNISTD_CHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int chown(const char *path, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_CHOWN_H diff --git a/libc/src/unistd/getgid.h b/libc/src/unistd/getgid.h new file mode 100644 index 0000000000000..eed0b20d688b1 --- /dev/null +++ b/libc/src/unistd/getgid.h @@ -0,0 +1,22 @@ +//===-- Implementation header for getgid ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_GETGID_H +#define LLVM_LIBC_SRC_UNISTD_GETGID_H + +#include "hdr/types/gid_t.h" +#include "hdr/unistd_macros.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +gid_t getgid(); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_GETGID_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index 4eb3c7d3d7fae..c2dacc6456e27 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -25,6 +25,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + chown + SRCS + chown.cpp + HDRS + ../chown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( close SRCS @@ -276,6 +290,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + getgid + SRCS + getgid.cpp + HDRS + ../getgid.h + DEPENDS + libc.hdr.types.gid_t + libc.hdr.fcntl_macros + libc.include.unistd + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil +) + add_entrypoint_object( getuid SRCS diff --git a/libc/src/unistd/linux/chown.cpp b/libc/src/unistd/linux/chown.cpp new file mode 100644 index 0000000000000..c7bf1703ffe57 --- /dev/null +++ b/libc/src/unistd/linux/chown.cpp @@ -0,0 +1,29 @@ +//===-- Linux implementation of chown -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/chown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" + +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, chown, (const char *path, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_chown, path, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/getgid.cpp b/libc/src/unistd/linux/getgid.cpp new file mode 100644 index 0000000000000..1656fd601d843 --- /dev/null +++ b/libc/src/unistd/linux/getgid.cpp @@ -0,0 +1,23 @@ +//===-- Linux implementation of getgid ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getgid.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#include <sys/syscall.h> // For syscall numbers. 
+ +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(gid_t, getgid, ()) { + return LIBC_NAMESPACE::syscall_impl<gid_t>(SYS_getgid); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp index 4393f9d5e5c3b..64f50d7be7fe3 100644 --- a/libc/test/UnitTest/FEnvSafeTest.cpp +++ b/libc/test/UnitTest/FEnvSafeTest.cpp @@ -43,7 +43,8 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) { void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv, const fenv_t &after_fenv) { -#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) +#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) && \ + defined(__ARM_FP) using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState; const FPState &before_state = reinterpret_cast<const FPState &>(before_fenv); const FPState &after_state = reinterpret_cast<const FPState &>(after_fenv); diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index f1b545ba546f9..42fdd59cf4d9c 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -1537,6 +1537,14 @@ TEST(LlvmLibcSPrintfTest, FloatDecimalLongDoubleConv) { #if defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80) #ifndef LIBC_COPT_FLOAT_TO_STR_REDUCED_PRECISION + written = LIBC_NAMESPACE::sprintf( + buff, "%.75Lf", + 0.0833333333333333333355920878593448009041821933351457118988037109375L); + ASSERT_STREQ_LEN(written, buff, + "0." + "08333333333333333333559208785934480090418219333514571189880" + "3710937500000000"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lf", 1e100L); ASSERT_STREQ_LEN(written, buff, "99999999999999999996693535322073426194986990198284960792713" @@ -2976,6 +2984,10 @@ TEST(LlvmLibcSPrintfTest, FloatAutoLongDoubleConv) { written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xf.fffffffffffffffp+16380L); ASSERT_STREQ_LEN(written, buff, "1.18973e+4932"); + // Minimum normal + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 3.36210314311209350626E-4932L); + ASSERT_STREQ_LEN(written, buff, "3.3621e-4932"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xa.aaaaaaaaaaaaaabp-7L); ASSERT_STREQ_LEN(written, buff, "0.0833333"); diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 44f28fff9ad39..07070535459ec 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -36,6 +36,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + chown_test + SUITE + libc_unistd_unittests + SRCS + chown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.chown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( dup_test SUITE @@ -437,6 +457,16 @@ add_libc_unittest( libc.test.UnitTest.ErrnoCheckingTest ) +add_libc_unittest( + getgid_test + SUITE + libc_unistd_unittests + SRCS + getgid_test.cpp + DEPENDS + libc.src.unistd.getgid +) + add_libc_unittest( getpid_test SUITE diff --git a/libc/test/src/unistd/chown_test.cpp b/libc/test/src/unistd/chown_test.cpp new file mode 100644 index 0000000000000..8b1f783273624 --- /dev/null +++ b/libc/test/src/unistd/chown_test.cpp @@ -0,0 +1,51 @@ +//===-- Unittests for chown -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/chown.h" +#include "src/unistd/close.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcChownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcChownTest, ChownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "chown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::chown(TEST_FILE, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcChownTest, ChownNonExistentFile) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::chown("non-existent-file", 1000, 1000), + Fails(ENOENT)); +} diff --git a/llvm/lib/BinaryFormat/Minidump.cpp b/libc/test/src/unistd/getgid_test.cpp similarity index 51% rename from llvm/lib/BinaryFormat/Minidump.cpp rename to libc/test/src/unistd/getgid_test.cpp index b618fb1570126..77dbad2f18e00 100644 --- a/llvm/lib/BinaryFormat/Minidump.cpp +++ b/libc/test/src/unistd/getgid_test.cpp @@ -1,4 +1,4 @@ -//===-- Minidump.cpp - Minidump constants and structures ---------*- C++-*-===// +//===-- Unittests for getgid ----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,9 +6,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/BinaryFormat/Minidump.h" +#include "src/unistd/getgid.h" +#include "test/UnitTest/Test.h" -using namespace llvm::minidump; - -constexpr uint32_t Header::MagicSignature; -constexpr uint16_t Header::MagicVersion; +TEST(LlvmLibcGetGidTest, SmokeTest) { + // getgid always succeeds. So, we just call it as a smoke test. 
+ LIBC_NAMESPACE::getgid(); +} diff --git a/libclc/clc/include/clc/math/clc_cbrt.inc b/libclc/clc/include/clc/math/clc_cbrt.h similarity index 100% rename from libclc/clc/include/clc/math/clc_cbrt.inc rename to libclc/clc/include/clc/math/clc_cbrt.h diff --git a/libclc/clc/lib/generic/math/clc_cbrt.cl b/libclc/clc/lib/generic/math/clc_cbrt.cl index 105f6329d5bad..935b7b7eae78c 100644 --- a/libclc/clc/lib/generic/math/clc_cbrt.cl +++ b/libclc/clc/lib/generic/math/clc_cbrt.cl @@ -8,6 +8,7 @@ #include <clc/clc_convert.h> #include <clc/internal/clc.h> +#include <clc/math/clc_cbrt.h> #include <clc/math/clc_copysign.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_fma.h> diff --git a/libclc/opencl/lib/generic/math/cbrt.cl b/libclc/opencl/lib/generic/math/cbrt.cl index 0d670150ed4c9..7de61436522b3 100644 --- a/libclc/opencl/lib/generic/math/cbrt.cl +++ b/libclc/opencl/lib/generic/math/cbrt.cl @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include <clc/math/clc_cbrt.inc> +#include <clc/math/clc_cbrt.h> #include <clc/opencl/math/cbrt.h> #define __CLC_FUNCTION cbrt diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index dbe69484abedf..e15c5b1a5d32f 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -451,7 +451,7 @@ Instead use: .. code-block:: cpp - // UNSUPPORTED: std-at-least-c++26 + // REQUIRES: std-at-least-c++26 There is no corresponding ``std-at-most-c++23``. This could be useful when tests are only valid for a small set of standard versions. For example, a diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index 796fa924be121..aef036a2c9586 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -11,18 +11,24 @@ #include <__config> #include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__exception/operations.h> #include <__memory/addressof.h> #include <__memory/construct_at.h> #include <__type_traits/decay.h> #include <__type_traits/is_pointer.h> -#include <cstdlib> +#include <__utility/move.h> +#include <__utility/swap.h> +#include <__verbose_abort> #include <typeinfo> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_ABI_MICROSOFT # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION @@ -30,7 +36,7 @@ namespace __cxxabiv1 { extern "C" { -_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(size_t) throw(); +_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(std::size_t) throw(); _LIBCPP_OVERRIDABLE_FUNC_VIS void __cxa_free_exception(void*) throw(); struct __cxa_exception; @@ -57,6 +63,8 @@ _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #ifndef _LIBCPP_ABI_MICROSOFT +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { void* __ptr_; @@ -75,7 +83,15 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {} exception_ptr(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr(exception_ptr&& __other) _NOEXCEPT : __ptr_(__other.__ptr_) { + __other.__ptr_ = nullptr; + } exception_ptr& operator=(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr& operator=(exception_ptr&& __other) _NOEXCEPT { + exception_ptr __tmp(std::move(__other)); + std::swap(__tmp, *this); + return 
*this; + } ~exception_ptr() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __ptr_ != nullptr; } @@ -88,10 +104,16 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { return !(__x == __y); } + friend _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + friend _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; friend _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); }; +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT { + std::swap(__x.__ptr_, __y.__ptr_); +} + # if _LIBCPP_HAS_EXCEPTIONS # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION template <class _Ep> @@ -153,7 +175,7 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { # else // !_LIBCPP_HAS_EXCEPTIONS template <class _Ep> _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep) _NOEXCEPT { - std::abort(); + _LIBCPP_VERBOSE_ABORT("make_exception_ptr was called in -fno-exceptions mode"); } # endif // _LIBCPP_HAS_EXCEPTIONS @@ -201,4 +223,6 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { #endif // _LIBCPP_ABI_MICROSOFT _LIBCPP_END_UNVERSIONED_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___EXCEPTION_EXCEPTION_PTR_H diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 9bd3af12c9572..7a7410b5dea08 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -30,6 +30,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_add_sat(__x, __y); +# else if (_Tp __sum; !__builtin_add_overflow(__x, __y, std::addressof(__sum))) return __sum; // Handle overflow @@ -44,10 +47,14 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y < 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_sub_sat(__x, __y); +# else if (_Tp __sub; !__builtin_sub_overflow(__x, __y, std::addressof(__sub))) return __sub; // Handle overflow @@ -63,6 +70,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y > 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> diff --git a/libcxx/include/__type_traits/reference_constructs_from_temporary.h b/libcxx/include/__type_traits/reference_constructs_from_temporary.h index 3d097ce90cb09..a8325620414ea 100644 --- a/libcxx/include/__type_traits/reference_constructs_from_temporary.h +++ b/libcxx/include/__type_traits/reference_constructs_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_constructs_from_temporary diff --git a/libcxx/include/__type_traits/reference_converts_from_temporary.h b/libcxx/include/__type_traits/reference_converts_from_temporary.h index c68f1765af9d5..9c51225e53b8e 100644 --- 
a/libcxx/include/__type_traits/reference_converts_from_temporary.h +++ b/libcxx/include/__type_traits/reference_converts_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_converts_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_converts_from_temporary diff --git a/libcxx/include/exception b/libcxx/include/exception index 74229cd16c006..0b2372e571e99 100644 --- a/libcxx/include/exception +++ b/libcxx/include/exception @@ -93,10 +93,13 @@ template <class E> void rethrow_if_nested(const E& e); # if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include <cstddef> -# include <cstdlib> # include <new> # include <type_traits> # endif + +# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 23 +# include <cstdlib> +# endif #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_EXCEPTION diff --git a/libcxx/include/string b/libcxx/include/string index 8f80afbc2fd37..ede42467b99fe 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -644,6 +644,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len ); # include <__utility/forward.h> # include <__utility/is_pointer_in_range.h> # include <__utility/move.h> +# include <__utility/no_destroy.h> # include <__utility/scope_guard.h> # include <__utility/swap.h> # include <climits> @@ -914,6 +915,11 @@ private: union __rep { __short __s; __long __l; + + __rep() = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__short __r) : __s(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__long __r) : __l(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__uninitialized_tag) {} }; _LIBCPP_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_); @@ -1206,7 +1212,10 @@ public: } # endif // _LIBCPP_CXX03_LANG - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(); } + // TODO(boomanaiden154): Once we mark this in destructors as dead on return, + // we can use a normal call to __reset_internal_buffer and remove the extra + // __rep constructor. + inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(__rep(__uninitialized_tag())); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator __self_view() const _NOEXCEPT { return __self_view(typename __self_view::__assume_valid(), data(), size()); @@ -2259,18 +2268,12 @@ private: return __long(__buffer, __capacity); } - // Deallocate the long buffer if it exists and clear the short buffer so we are an empty string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer() { + // Replace the current buffer with __new_rep. Deallocate the old long buffer if it exists. 
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer(__rep __new_rep = __short()) { __annotate_delete(); if (__is_long()) __alloc_traits::deallocate(__alloc_, __get_long_pointer(), __get_long_cap()); - __rep_.__s = __short(); - } - - // Replace the current buffer with __alloc; the first __size elements constitute a string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __replace_internal_buffer(__long __alloc) { - __reset_internal_buffer(); - __rep_.__l = __alloc; + __rep_ = __new_rep; } // Initialize the internal buffer to hold __size elements @@ -2444,7 +2447,7 @@ private: __annotate_delete(); auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); auto __alloc = __str.__alloc_; - __replace_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); + __reset_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); __alloc_ = std::move(__alloc); } } @@ -2710,7 +2713,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ __sec_cp_sz); __buffer.__size_ = __n_copy + __n_add + __sec_cp_sz; traits_type::assign(__buffer.__data_[__buffer.__size_], value_type()); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } // __grow_by is deprecated because it does not set the size. It may not update the size when the size is changed, and it @@ -2746,7 +2749,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait // This is -1 to make sure the caller sets the size properly, since old versions of this function didn't set the size // at all. __buffer.__size_ = -1; - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3394,7 +3397,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re __long __buffer = __allocate_long_buffer(__alloc_, __requested_capacity); __buffer.__size_ = size(); traits_type::copy(std::__to_address(__buffer.__data_), data(), __buffer.__size_ + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3433,7 +3436,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat } traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__get_long_pointer()), __size + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); # if _LIBCPP_HAS_EXCEPTIONS } catch (...) 
{ return; diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 466f501b5f4f8..3c5330dd6e14e 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -1443,7 +1443,7 @@ template <class _Tp, class _Tuple, class = enable_if_t<__can_make_from_tuple<_Tp inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp make_from_tuple(_Tuple&& __t) noexcept(noexcept(std::__make_from_tuple_impl<_Tp>(std::forward<_Tuple>(__t), make_index_sequence<tuple_size_v<remove_reference_t<_Tuple>>>()))) { -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 if constexpr (tuple_size_v<remove_reference_t<_Tuple>> == 1) { static_assert(!std::reference_constructs_from_temporary_v<_Tp, decltype(std::get<0>(std::declval<_Tuple>()))>, "Attempted construction of reference element binds to a temporary whose lifetime has ended"); diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 24aaabf0a87df..f608c94d3031e 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -186,99 +186,99 @@ public: # endif # endif -struct __type_info_implementations { - struct __string_impl_base { - typedef const char* __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* - __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return __v; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t - __string_to_type_name(const char* __v) _NOEXCEPT { - return __v; - } - }; +namespace __type_info_implementations { +struct __string_impl_base { + typedef const char* __type_name_t; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* + __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return __v; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t + __string_to_type_name(const char* __v) _NOEXCEPT { + return __v; + } +}; - struct __unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<size_t>(__v); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs < __rhs; - } - }; - - struct __non_unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { - size_t __hash = 5381; - while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) - __hash = (__hash * 33) ^ __c; - return __hash; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __builtin_strcmp(__lhs, __rhs) < 0; - } - }; +struct __unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<size_t>(__v); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs < __rhs; + } +}; + +struct 
__non_unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { + size_t __hash = 5381; + while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) + __hash = (__hash * 33) ^ __c; + return __hash; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __builtin_strcmp(__lhs, __rhs) < 0; + } +}; - struct __non_unique_arm_rtti_bit_impl { - typedef uintptr_t __type_name_t; +struct __non_unique_arm_rtti_bit_impl { + typedef uintptr_t __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { - return reinterpret_cast<__type_name_t>(__v); - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { + return reinterpret_cast<__type_name_t>(__v); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - if (__is_type_name_unique(__v)) - return __v; - return __non_unique_impl::__hash(__type_name_to_string(__v)); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__lhs == __rhs) - return true; - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - // Either both are unique and have a different address, or one of them - // is unique and the other one isn't. In both cases they are unequal. - return false; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - return __lhs < __rhs; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + if (__is_type_name_unique(__v)) + return __v; + return __non_unique_impl::__hash(__type_name_to_string(__v)); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__lhs == __rhs) + return true; + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + // Either both are unique and have a different address, or one of them + // is unique and the other one isn't. In both cases they are unequal. 
+ return false; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + return __lhs < __rhs; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; + } - private: - // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when - // this implementation is actually used. - typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> - __non_unique_rtti_bit; +private: + // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when + // this implementation is actually used. + typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> + __non_unique_rtti_bit; - _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { - return !(__lhs & __non_unique_rtti_bit::value); - } - }; + _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { + return !(__lhs & __non_unique_rtti_bit::value); + } +}; - typedef +typedef # if _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 1 - __unique_impl + __unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 2 - __non_unique_impl + __non_unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 3 - __non_unique_arm_rtti_bit_impl + __non_unique_arm_rtti_bit_impl # else # error invalid configuration for _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION # endif - __impl; -}; + __impl; +} // namespace __type_info_implementations # if __has_cpp_attribute(_Clang::__ptrauth_vtable_pointer__) # if __has_feature(ptrauth_type_info_vtable_pointer_discrimination) diff --git a/libcxx/modules/std/exception.inc b/libcxx/modules/std/exception.inc index 02b0f80190e5b..3dbc0112c15a0 100644 --- a/libcxx/modules/std/exception.inc +++ b/libcxx/modules/std/exception.inc @@ -18,6 +18,7 @@ export namespace std { using std::rethrow_exception; using std::rethrow_if_nested; using std::set_terminate; + using std::swap; using std::terminate; using std::terminate_handler; using std::throw_with_nested; diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index d047b29b63cc6..8c3e1f0a97dfe 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -245,7 +245,6 @@ deque stdexcept deque tuple deque version exception cstdint -exception cstdlib exception typeinfo exception version execution version diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index 897ae89365014..3fac952b9eb98 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -154,14 +154,10 @@ SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be speciali # endif # if TEST_STD_VER >= 23 -SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} -# if __has_builtin(__reference_constructs_from_temporary) +SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} 
SPECIALIZE_BTT(reference_constructs_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif -# if __has_builtin(__reference_converts_from_temporary) -SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif +SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 26 diff --git a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp index 13bd761ae9808..602bd1612015d 100644 --- a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp @@ -9,7 +9,7 @@ // https://github.com/llvm/llvm-project/issues/30023 // compare exchange does not work with types of which the size is not a power of 2 -// XFAIL: clang-19, clang-20, clang-21, apple-clang-15, apple-clang-16, apple-clang-17 +// XFAIL: clang-20, clang-21, apple-clang-17 // UNSUPPORTED: c++03 // TODO: remove the UNSUPPORTED clang-22 once libc++ CI's clang is updated to include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp index 731d751df08d9..dc4d8ae2851f4 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp index 64a26ed63e8ce..834c01b2272e2 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. 
-// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp index 0aded33e660d5..7e25d40dc8a7d 100644 --- a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp @@ -14,7 +14,6 @@ #include <exception> #include <cassert> -#include <type_traits> #include "test_macros.h" diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp new file mode 100644 index 0000000000000..6882bc6548da3 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move assignment of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: the move assignment + std::exception_ptr p3; + p3 = std::move(p2); + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. +#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT) + assert(p2 == nullptr); + assert(p2 == nullptr); +#endif + + try { + std::rethrow_exception(p3); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp new file mode 100644 index 0000000000000..122e229fd6e47 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move constructor of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: The move constructor + std::exception_ptr p3{std::move(p2)}; + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. +#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT) + assert(p2 == nullptr); +#endif + + try { + std::rethrow_exception(p3); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp new file mode 100644 index 0000000000000..82b4713bed538 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp @@ -0,0 +1,40 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions + +// <exception> + +// typedef unspecified exception_ptr; + +// Test swapping of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p21 = std::make_exception_ptr(42); + std::exception_ptr p42 = std::make_exception_ptr(21); + std::swap(p42, p21); + + try { + std::rethrow_exception(p21); + } catch (int e) { + assert(e == 21); + } + try { + std::rethrow_exception(p42); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp index 7571ced2e4431..233e8ed2338b6 100644 --- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: windows // These compilers don't support constexpr `__builtin_signbit` yet. 
-// UNSUPPORTED: clang-19, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // GCC warns about signbit comparing `bool_v < 0`, which we're testing // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp index 6bd112c7d1280..f49e19acf0234 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp index bdfc57694dd53..0789213163847 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp index 1fe7916c67823..f09bf30771102 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp index b797ae7533add..86e2e61647be8 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class R, class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp index 8b6188f1fad0e..c2be8c5a47bdf 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp index f71f688afc52e..bb0b2d322218d 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp @@ -10,7 +10,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // These compiler versions and platforms don't enable sized deallocation by default. 
-// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp index 2e0b1e9025a61..b6b226c7f79d8 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // These compilers don't support __builtin_is_virtual_base_of yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp index 1ca9d44b82afe..f43693c08bc39 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp index ad53c8176cc92..84fe7cfb02208 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_constructs_from_temporary; diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp index 73cc4f3e29d5a..8319d9e1563fe 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_converts_from_temporary; diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp index 2dfbae9138864..12d778408d5ec 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp @@ -19,11 +19,7 @@ #include "test_macros.h" void test() { - // FreeBSD ci use clang 19.1.1, which hasn't implement __reference_constructs_from_temporary. 
- // The static_assert inner std::make_from_tuple will not triggered. -#if __has_builtin(__reference_constructs_from_temporary) // expected-error@*:* {{static assertion failed}} -#endif // Turns to an error since C++26 (Disallow Binding a Returned Glvalue to a Temporary https://wg21.link/P2748R5). #if TEST_STD_VER >= 26 diff --git a/lld/test/wasm/runtime-relocations-himem.s b/lld/test/wasm/runtime-relocations-himem.s new file mode 100644 index 0000000000000..a12a93a6cb933 --- /dev/null +++ b/lld/test/wasm/runtime-relocations-himem.s @@ -0,0 +1,60 @@ +## Verifies runtime relocation code for addresses over 2gb works correctly. +## We have had issues with LEB encoding of address over 2gb in i32.const +## instruction leading to invalid binaries. + +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld --global-base=2147483648 --experimental-pic --unresolved-symbols=import-dynamic -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o +# XUN: obj2yaml %t.wasm | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s -- + +.globl tls_sym +.globl data_sym +.globl _start +.globaltype __tls_base, i32 + +_start: + .functype _start () -> () + global.get __tls_base + i32.const tls_sym@TLSREL + i32.add + drop + i32.const data_sym + drop + end_function + +.section tls_sec,"T",@ +.p2align 2 +tls_sym: + .int32 0 + .int32 extern_sym + .size tls_sym, 8 + +.section data_sec,"",@ +.p2align 2 +data_sym: + .int32 0 + .int32 extern_sym + .size data_sym, 8 + +.section .custom_section.target_features,"",@ + .int8 2 + .int8 43 + .int8 7 + .ascii "atomics" + .int8 43 + .int8 11 + .ascii "bulk-memory" + +# CHECK: <__wasm_apply_data_relocs>: +# CHECK-EMPTY: +# CHECK-NEXT: i32.const -2147483636 +# CHECK-NEXT: global.get 0 +# CHECK-NEXT: i32.store 0 +# CHECK-NEXT: end + +# CHECK: <__wasm_apply_tls_relocs>: +# CHECK-EMPTY: +# CHECK-NEXT: i32.const -2147483644 +# CHECK-NEXT: global.get 0 +# CHECK-NEXT: i32.store 0 +# CHECK-NEXT: end diff --git a/lld/test/wasm/stack-first.test b/lld/test/wasm/stack-first.test index 72e1a006d5700..46e8c6ebc2381 100644 --- a/lld/test/wasm/stack-first.test +++ b/lld/test/wasm/stack-first.test @@ -8,6 +8,15 @@ RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/stack-first. 
RUN: wasm-ld -z stack-size=512 --stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o RUN: obj2yaml %t.wasm | FileCheck %s +; Check `--no-stack-first` +RUN: wasm-ld -z stack-size=512 --stack-first --no-stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + +; Check that the default is no-stack-first +RUN: wasm-ld -z stack-size=512 --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + + CHECK: - Type: GLOBAL CHECK-NEXT: Globals: CHECK-NEXT: - Index: 0 @@ -51,3 +60,19 @@ CHECK-NEXT: Index: 2 CHECK-NEXT: - Name: __heap_base CHECK-NEXT: Kind: GLOBAL CHECK-NEXT: Index: 3 + +NOT-FIRST: - Type: GLOBAL +NOT-FIRST-NEXT: Globals: +NOT-FIRST-NEXT: - Index: 0 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: true +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1552 +NOT-FIRST-NEXT: - Index: 1 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: false +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1024 + diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 9c0e1b58e62f9..aad8095881bba 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -595,7 +595,7 @@ static void readConfigs(opt::InputArgList &args) { ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); ctx.arg.stripAll = args.hasArg(OPT_strip_all); ctx.arg.stripDebug = args.hasArg(OPT_strip_debug); - ctx.arg.stackFirst = args.hasArg(OPT_stack_first); + ctx.arg.stackFirst = args.hasFlag(OPT_stack_first, OPT_no_stack_first, false); ctx.arg.trace = args.hasArg(OPT_trace); ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); ctx.arg.thinLTOCachePolicy = CHECK( diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index 44927e7a432bc..14e02e6009318 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -423,8 +423,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { bool is64 = ctx.arg.is64.value_or(false); bool generated = false; - unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST - : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; @@ -451,8 +449,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { << " output offset=" << offset << "\n"); // Calculate the address at which to apply the relocation - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, offset, "offset"); + writePtrConst(os, offset, is64, "offset"); // In PIC mode we need to add the __memory_base if (ctx.isPic) { @@ -466,8 +463,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { // Now figure out what we want to store at this location bool is64 = relocIs64(rel.Type); - unsigned opcode_reloc_const = - is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_reloc_add = is64 ? 
WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; unsigned opcode_reloc_store = @@ -477,8 +472,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, sym->getGOTIndex(), "global index"); if (rel.Addend) { - writeU8(os, opcode_reloc_const, "CONST"); - writeSleb128(os, rel.Addend, "addend"); + writePtrConst(os, rel.Addend, is64, "addend"); writeU8(os, opcode_reloc_add, "ADD"); } } else { @@ -491,8 +485,8 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { baseSymbol = ctx.sym.tlsBase; writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, baseSymbol->getGlobalIndex(), "base"); - writeU8(os, opcode_reloc_const, "CONST"); - writeSleb128(os, file->calcNewValue(rel, tombstone, this), "offset"); + writePtrConst(os, file->calcNewValue(rel, tombstone, this), is64, + "offset"); writeU8(os, opcode_reloc_add, "ADD"); } diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 2f699e2f68350..0ed69258d99aa 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -250,8 +250,9 @@ def no_entry: FF<"no-entry">, def no_shlib_sigcheck: FF<"no-shlib-sigcheck">, HelpText<"Do not check signatures of functions defined in shared libraries.">; -def stack_first: FF<"stack-first">, - HelpText<"Place stack at start of linear memory rather than after data">; +defm stack_first: B<"stack-first", + "Place stack at start of linear memory", + "Place the stack after static data region (default)">; def table_base: JJ<"table-base=">, HelpText<"Table offset at which to place address taken functions (Defaults to 1)">; diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index e1192706ea913..399a5084e6595 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -434,8 +434,6 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) { void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { assert(!ctx.arg.extendedConst); bool is64 = ctx.arg.is64.value_or(false); - unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ?
WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; @@ -452,8 +450,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "__memory_base"); // Add the virtual address of the data symbol - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, d->getVA(), "offset"); + writePtrConst(os, d->getVA(), is64, "offset"); } else if (auto *f = dyn_cast<FunctionSymbol>(sym)) { if (f->isStub) continue; @@ -462,8 +459,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { writeUleb128(os, ctx.sym.tableBase->getGlobalIndex(), "__table_base"); // Add the table index to __table_base - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, f->getTableIndex(), "offset"); + writePtrConst(os, f->getTableIndex(), is64, "offset"); } else { assert(isa<UndefinedData>(sym) || isa<SharedData>(sym)); continue; diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index e3b72e94d4beb..01b5546fee00d 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -87,6 +87,12 @@ if (LLDB_ENABLE_PYTHON) set(LLDB_PYTHON_EXT_SUFFIX "_d${LLDB_PYTHON_EXT_SUFFIX}") endif() endif() + if(TARGET Python3::Python) + get_target_property(_Python3_LIB_PATH Python3::Python IMPORTED_LIBRARY_LOCATION) + if(_Python3_LIB_PATH) + get_filename_component(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME "${_Python3_LIB_PATH}" NAME) + endif() + endif() endif () if (LLDB_ENABLE_LUA) diff --git a/lldb/include/lldb/Target/InstrumentationRuntime.h b/lldb/include/lldb/Target/InstrumentationRuntime.h index a6121c24b9560..d2499528e97ab 100644 --- a/lldb/include/lldb/Target/InstrumentationRuntime.h +++ b/lldb/include/lldb/Target/InstrumentationRuntime.h @@ -73,6 +73,13 @@ class InstrumentationRuntime /// is guaranteed to be loaded. virtual void Activate() = 0; + /// \return true if `CheckIfRuntimeIsValid` should be called on all modules. + /// In this case the return value of `GetPatternForRuntimeLibrary` will be + /// ignored. Return false if `CheckIfRuntimeIsValid` should only be called + /// for modules whose name matches `GetPatternForRuntimeLibrary`. + /// + virtual bool MatchAllModules() { return false; } + public: static void ModulesDidLoad(lldb_private::ModuleList &module_list, Process *process, diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 82774d56922a9..13455552131da 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -300,6 +300,12 @@ class Stream { /// The current indentation level. unsigned GetIndentLevel() const; + /// Set the current indentation level. + /// + /// \param[in] level + /// The new indentation level. + void SetIndentLevel(unsigned level); + /// Indent the current line in the stream. /// /// Indent the current line using the current indentation level and print an @@ -315,6 +321,20 @@ class Stream { /// Increment the current indentation level. void IndentMore(unsigned amount = 2); + struct IndentScope { + IndentScope(Stream &stream) + : m_stream(stream), m_original_indent_level(stream.GetIndentLevel()) {} + ~IndentScope() { m_stream.SetIndentLevel(m_original_indent_level); } + + private: + Stream &m_stream; + unsigned m_original_indent_level; + }; + + /// Create an indentation scope that restores the original indent level when + /// the object goes out of scope (RAII). + IndentScope MakeIndentScope(unsigned indent_amount = 2); + /// Output an offset value. 
/// /// Put an offset \a uval out to the stream using the printf format in \a @@ -364,12 +384,6 @@ class Stream { /// address and pointer values. void SetAddressByteSize(uint32_t addr_size); - /// Set the current indentation level. - /// - /// \param[in] level - /// The new indentation level. - void SetIndentLevel(unsigned level); - /// Output a SLEB128 number to the stream. /// /// Put an SLEB128 \a uval out to the stream using the printf format in \a diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index 96c7b3987d8a1..024c9f1c7e435 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -258,6 +258,7 @@ def _getDebugInfoArgs(self, debug_info): "gmodules": {"MAKE_DSYM": "NO", "MAKE_GMODULES": "YES"}, "debug_names": {"MAKE_DEBUG_NAMES": "YES"}, "dwp": {"MAKE_DSYM": "NO", "MAKE_DWP": "YES"}, + "pdb": {"MAKE_PDB": "YES"}, } # Collect all flags, with later options overriding earlier ones diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index b92de941c4124..8c1eea97620e2 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1791,6 +1791,11 @@ def no_reason(_): if can_replicate ] + # PDB is off by default, because it has a lot of failures right now. + # See llvm.org/pr149498 + if original_testcase.TEST_WITH_PDB_DEBUG_INFO: + dbginfo_categories.append("pdb") + xfail_for_debug_info_cat_fn = getattr( attrvalue, "__xfail_for_debug_info_cat_fn__", no_reason ) @@ -1878,6 +1883,13 @@ class TestBase(Base, metaclass=LLDBTestCaseFactory): # test multiple times with various debug info types. NO_DEBUG_INFO_TESTCASE = False + TEST_WITH_PDB_DEBUG_INFO = False + """ + Subclasses can set this to True to test with PDB in addition to the other debug info + types. This is off by default because many tests will fail due to missing functionality in PDB. + See llvm.org/pr149498. + """ + def generateSource(self, source): template = source + ".template" temp = os.path.join(self.getSourceDir(), template) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 28cae54776ac8..b3822db162a0b 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -249,6 +249,10 @@ ifeq ($(CC_TYPE), clang) MODULE_DEBUG_INFO_FLAGS += -gmodules endif +ifeq "$(MAKE_PDB)" "YES" + DEBUG_INFO_FLAG ?= -g -gcodeview +endif + # If the OS is Windows, we need to pass -gdwarf to clang, otherwise it will build # with codeview by default but all the tests rely on dwarf.
ifeq "$(OS)" "Windows_NT" diff --git a/lldb/packages/Python/lldbsuite/test/test_categories.py b/lldb/packages/Python/lldbsuite/test/test_categories.py index 1f6e8a78e0c0d..b8a764fb3349a 100644 --- a/lldb/packages/Python/lldbsuite/test/test_categories.py +++ b/lldb/packages/Python/lldbsuite/test/test_categories.py @@ -12,7 +12,13 @@ # Key: Category name # Value: should be used in lldbtest's debug-info replication -debug_info_categories = {"dwarf": True, "dwo": True, "dsym": True, "gmodules": False} +debug_info_categories = { + "dwarf": True, + "dwo": True, + "dsym": True, + "pdb": False, + "gmodules": False, +} all_categories = { "basic_process": "Basic process execution sniff tests.", @@ -34,6 +40,7 @@ "lldb-dap": "Tests for the Debug Adapter Protocol with lldb-dap", "llgs": "Tests for the gdb-server functionality of lldb-server", "msvcstl": "Test for MSVC STL data formatters", + "pdb": "Tests that can be run with PDB debug information", "pexpect": "Tests requiring the pexpect library to be available", "objc": "Tests related to the Objective-C programming language support", "pyapi": "Tests related to the Python API", @@ -65,6 +72,8 @@ def is_supported_on_platform(category, platform, compiler_path): if platform not in ["darwin", "macosx", "ios", "watchos", "tvos", "bridgeos"]: return False return gmodules.is_compiler_clang_with_gmodules(compiler_path) + elif category == "pdb": + return platform == "windows" return True diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index d892c01f0bc71..ac550962cfb85 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -32,6 +32,10 @@ # timeout by a factor of 10 if ASAN is enabled. DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) +# See lldbtest.Base.spawnSubprocess, which should help ensure any processes +# created by the DAP client are terminated correctly when the test ends. +SpawnHelperCallback = Callable[[str, List[str], List[str]], subprocess.Popen] + ## DAP type references @@ -191,14 +195,16 @@ def __init__( self, recv: BinaryIO, send: BinaryIO, - init_commands: list[str], - log_file: Optional[TextIO] = None, + init_commands: Optional[List[str]] = None, + log_file: Optional[str] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): # For debugging test failures, try setting `trace_file = sys.stderr`. self.trace_file: Optional[TextIO] = None self.log_file = log_file self.send = send self.recv = recv + self.spawn_helper = spawn_helper # Packets that have been received and processed but have not yet been # requested by a test case. @@ -211,7 +217,7 @@ def __init__( self._recv_thread = threading.Thread(target=self._read_packet_thread) # session state - self.init_commands = init_commands + self.init_commands = init_commands if init_commands else [] self.exit_status: Optional[int] = None self.capabilities: Dict = {} self.initialized: bool = False @@ -310,11 +316,6 @@ def collect_output( output += self.get_output(category, clear=clear) return output - def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]): - with self.recv_condition: - self.recv_packets.append(packet) - self.recv_condition.notify() - def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: """Handles an incoming packet. 
@@ -460,22 +461,11 @@ def _handle_reverse_request(self, request: Request) -> None: self.reverse_requests.append(request) arguments = request.get("arguments") if request["command"] == "runInTerminal" and arguments is not None: - in_shell = arguments.get("argsCanBeInterpretedByShell", False) - print("spawning...", arguments["args"]) - proc = subprocess.Popen( - arguments["args"], - env=arguments.get("env", {}), - cwd=arguments.get("cwd", None), - stdin=subprocess.DEVNULL, - stdout=sys.stderr, - stderr=sys.stderr, - shell=in_shell, - ) - body = {} - if in_shell: - body["shellProcessId"] = proc.pid - else: - body["processId"] = proc.pid + assert self.spawn_helper is not None, "Not configured to spawn subprocesses" + [exe, *args] = arguments["args"] + env = [f"{k}={v}" for k, v in arguments.get("env", {}).items()] + proc = self.spawn_helper(exe, args, env) + body = {"processId": proc.pid} self.send_packet( { "type": "response", @@ -1501,12 +1491,14 @@ def request_setInstructionBreakpoints(self, memory_reference=[]): class DebugAdapterServer(DebugCommunication): def __init__( self, + *, executable: Optional[str] = None, connection: Optional[str] = None, - init_commands: list[str] = [], - log_file: Optional[TextIO] = None, - env: Optional[dict[str, str]] = None, - additional_args: list[str] = [], + init_commands: Optional[list[str]] = None, + log_file: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + additional_args: Optional[List[str]] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): self.process = None self.connection = None @@ -1532,13 +1524,21 @@ def __init__( s = socket.create_connection((host.strip("[]"), int(port))) else: raise ValueError("invalid connection: {}".format(connection)) - DebugCommunication.__init__( - self, s.makefile("rb"), s.makefile("wb"), init_commands, log_file + super().__init__( + s.makefile("rb"), + s.makefile("wb"), + init_commands, + log_file, + spawn_helper, ) self.connection = connection else: - DebugCommunication.__init__( - self, self.process.stdout, self.process.stdin, init_commands, log_file + super().__init__( + self.process.stdout, + self.process.stdin, + init_commands, + log_file, + spawn_helper, ) @classmethod @@ -1546,14 +1546,14 @@ def launch( cls, *, executable: str, - env: Optional[dict[str, str]] = None, - log_file: Optional[TextIO] = None, + env: Optional[Dict[str, str]] = None, + log_file: Optional[str] = None, connection: Optional[str] = None, connection_timeout: Optional[int] = None, - additional_args: list[str] = [], + additional_args: Optional[List[str]] = None, ) -> tuple[subprocess.Popen, Optional[str]]: adapter_env = os.environ.copy() - if env is not None: + if env: adapter_env.update(env) if log_file: @@ -1561,7 +1561,8 @@ def launch( args = [executable] # Add additional arguments first (like --no-lldbinit) - args.extend(additional_args) + if additional_args: + args.extend(additional_args) if connection is not None: args.append("--connection") diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index c6c4a3e2a4e1e..71ca60ebe8d34 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -39,6 +39,7 @@ def create_debug_adapter( log_file=log_file_path, env=lldbDAPEnv, additional_args=additional_args or [], + spawn_helper=self.spawnSubprocess, ) def build_and_create_debug_adapter( diff --git 
a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index 88a02dce35b9d..9133359fbf537 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -265,6 +265,29 @@ class CommandObjectFrameSelect : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } +private: + void SkipHiddenFrames(Thread &thread, uint32_t frame_idx) { + uint32_t candidate_idx = frame_idx; + const unsigned max_depth = 12; + for (unsigned num_try = 0; num_try < max_depth; ++num_try) { + if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { + candidate_idx = UINT32_MAX; + break; + } + candidate_idx += *m_options.relative_frame_offset; + if (auto candidate_sp = thread.GetStackFrameAtIndex(candidate_idx)) { + if (candidate_sp->IsHidden()) + continue; + // Now candidate_idx is the first non-hidden frame. + break; + } + candidate_idx = UINT32_MAX; + break; + }; + if (candidate_idx != UINT32_MAX) + m_options.relative_frame_offset = candidate_idx - frame_idx; + } + protected: void DoExecute(Args &command, CommandReturnObject &result) override { // No need to check "thread" for validity as eCommandRequiresThread ensures @@ -278,28 +301,13 @@ class CommandObjectFrameSelect : public CommandObjectParsed { if (frame_idx == UINT32_MAX) frame_idx = 0; - // If moving up/down by one, skip over hidden frames. - if (*m_options.relative_frame_offset == 1 || - *m_options.relative_frame_offset == -1) { - uint32_t candidate_idx = frame_idx; - const unsigned max_depth = 12; - for (unsigned num_try = 0; num_try < max_depth; ++num_try) { - if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { - candidate_idx = UINT32_MAX; - break; - } - candidate_idx += *m_options.relative_frame_offset; - if (auto candidate_sp = thread->GetStackFrameAtIndex(candidate_idx)) { - if (candidate_sp->IsHidden()) - continue; - // Now candidate_idx is the first non-hidden frame. - break; - } - candidate_idx = UINT32_MAX; - break; - }; - if (candidate_idx != UINT32_MAX) - m_options.relative_frame_offset = candidate_idx - frame_idx; + // If moving up/down by one, skip over hidden frames, unless we started + // in a hidden frame. + if ((*m_options.relative_frame_offset == 1 || + *m_options.relative_frame_offset == -1)) { + if (auto current_frame_sp = thread->GetStackFrameAtIndex(frame_idx); + !current_frame_sp->IsHidden()) + SkipHiddenFrames(*thread, frame_idx); } if (*m_options.relative_frame_offset < 0) { diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp index ff37b48d86ca8..a5547a4699ca9 100644 --- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp +++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp @@ -798,6 +798,8 @@ bool ABISysV_riscv::RegisterIsCalleeSaved(const RegisterInfo *reg_info) { .Cases({"f8", "f9", "f18", "f19", "f20", "f21", "f22", "f23"}, is_hw_fp) .Cases({"f24", "f25", "f26", "f27"}, is_hw_fp) + // vlenb is constant and needed for vector unwinding. 
+ .Case("vlenb", true) .Default(false); return is_callee_saved; diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt index 1717b0a896669..727c8290bceb4 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt @@ -1,10 +1,13 @@ add_lldb_library(lldbPluginCPPRuntime CPPLanguageRuntime.cpp + VerboseTrapFrameRecognizer.cpp LINK_LIBS lldbCore lldbSymbol lldbTarget + CLANG_LIBS + clangCodeGen ) add_subdirectory(ItaniumABI) diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index 21a5ebe53073a..913678b629f2f 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -12,6 +12,7 @@ #include <memory> #include "CPPLanguageRuntime.h" +#include "VerboseTrapFrameRecognizer.h" #include "llvm/ADT/StringRef.h" @@ -107,12 +108,15 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer { CPPLanguageRuntime::CPPLanguageRuntime(Process *process) : LanguageRuntime(process) { - if (process) + if (process) { process->GetTarget().GetFrameRecognizerManager().AddRecognizer( StackFrameRecognizerSP(new LibCXXFrameRecognizer()), {}, std::make_shared<RegularExpression>("^std::__[^:]*::"), /*mangling_preference=*/Mangled::ePreferDemangledWithoutArguments, /*first_instruction_only=*/false); + + RegisterVerboseTrapFrameRecognizer(*process); + } } bool CPPLanguageRuntime::IsAllowedRuntimeValue(ConstString name) { diff --git a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp similarity index 81% rename from lldb/source/Target/VerboseTrapFrameRecognizer.cpp rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp index 03ab58b8c59a9..2b6bf2cd470e6 100644 --- a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp @@ -1,4 +1,4 @@ -#include "lldb/Target/VerboseTrapFrameRecognizer.h" +#include "VerboseTrapFrameRecognizer.h" #include "lldb/Core/Module.h" #include "lldb/Symbol/Function.h" @@ -95,33 +95,14 @@ VerboseTrapFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame_sp) { if (func_name.empty()) return {}; - static auto trap_regex = - llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); - SmallVector<llvm::StringRef, 3> matches; - std::string regex_err_msg; - if (!trap_regex.match(func_name, &matches, ®ex_err_msg)) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Failed to parse match trap regex for '%s': %s", func_name.data(), - regex_err_msg.c_str()); - - return {}; - } - - // For `__clang_trap_msg$category$message$` we expect 3 matches: - // 1. entire string - // 2. category - // 3. message - if (matches.size() != 3) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Unexpected function name format. 
Expected '<trap prefix>$<trap " - "category>$<trap message>'$ but got: '%s'.", - func_name.data()); - + auto maybe_trap_reason = + clang::CodeGen::DemangleTrapReasonInDebugInfo(func_name); + if (!maybe_trap_reason.has_value()) { + LLDB_LOGF(GetLog(LLDBLog::Unwind), "Failed to demangle '%s' as trap reason", + func_name.str().c_str()); return {}; } - - auto category = matches[1]; - auto message = matches[2]; + auto [category, message] = maybe_trap_reason.value(); std::string stop_reason = category.empty() ? "<empty category>" : category.str(); diff --git a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h similarity index 63% rename from lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h index 7e045760a28be..7d7020f63c8d2 100644 --- a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h @@ -1,5 +1,13 @@ -#ifndef LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H -#define LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +//===-- VerboseTrapFrameRecognizer.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H +#define LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H #include "lldb/Target/StackFrameRecognizer.h" @@ -36,4 +44,4 @@ class VerboseTrapFrameRecognizer : public StackFrameRecognizer { } // namespace lldb_private -#endif // LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +#endif // LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 49841e7307443..e06e69fb08305 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -2735,9 +2735,8 @@ static void ApplyELF64ABS64Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint64_t *dst = reinterpret_cast<uint64_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset64(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset64(rel); uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel); memcpy(dst, &val_offset, sizeof(uint64_t)); } @@ -2762,9 +2761,8 @@ static void ApplyELF64ABS32Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. 
WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint32_t *dst = reinterpret_cast<uint32_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset32(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset32(rel); memcpy(dst, &truncated_addr, sizeof(uint32_t)); } } diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index b7788e80eecac..8e6d51efad1f3 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -80,7 +80,6 @@ add_lldb_library(lldbTarget UnixSignals.cpp UnwindAssembly.cpp UnwindLLDB.cpp - VerboseTrapFrameRecognizer.cpp ADDITIONAL_HEADER_DIRS ${LLDB_INCLUDE_DIR}/lldb/Target diff --git a/lldb/source/Target/InstrumentationRuntime.cpp b/lldb/source/Target/InstrumentationRuntime.cpp index 7e58e8bf26cb1..d9800a8541f4e 100644 --- a/lldb/source/Target/InstrumentationRuntime.cpp +++ b/lldb/source/Target/InstrumentationRuntime.cpp @@ -55,7 +55,8 @@ void InstrumentationRuntime::ModulesDidLoad( return IterationAction::Continue; const RegularExpression &runtime_regex = GetPatternForRuntimeLibrary(); - if (runtime_regex.Execute(file_spec.GetFilename().GetCString()) || + if (MatchAllModules() || + runtime_regex.Execute(file_spec.GetFilename().GetCString()) || module_sp->IsExecutable()) { if (CheckIfRuntimeIsValid(module_sp)) { SetRuntimeModuleSP(module_sp); diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index fb9e7eb5ed1bd..42ce198a283da 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -65,7 +65,6 @@ #include "lldb/Target/ThreadPlanCallFunction.h" #include "lldb/Target/ThreadPlanStack.h" #include "lldb/Target/UnixSignals.h" -#include "lldb/Target/VerboseTrapFrameRecognizer.h" #include "lldb/Utility/AddressableBits.h" #include "lldb/Utility/Event.h" #include "lldb/Utility/LLDBLog.h" @@ -513,7 +512,6 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, // We should have a plugin do the registration instead, for example, a // common C LanguageRuntime plugin. 
RegisterAssertFrameRecognizer(this); - RegisterVerboseTrapFrameRecognizer(*this); } Process::~Process() { diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 1e43094421f0a..a23091ad09c6d 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3962,9 +3962,7 @@ void Target::StopHook::GetDescription(Stream &s, return; } - unsigned indent_level = s.GetIndentLevel(); - - s.SetIndentLevel(indent_level + 2); + auto indent_scope = s.MakeIndentScope(); s.Printf("Hook: %" PRIu64 "\n", GetID()); if (m_active) @@ -3978,19 +3976,17 @@ void Target::StopHook::GetDescription(Stream &s, if (m_specifier_sp) { s.Indent(); s.PutCString("Specifier:\n"); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); m_specifier_sp->GetDescription(&s, level); - s.SetIndentLevel(indent_level + 2); } if (m_thread_spec_up) { StreamString tmp; s.Indent("Thread:\n"); m_thread_spec_up->GetDescription(&tmp, level); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); s.Indent(tmp.GetString()); s.PutCString("\n"); - s.SetIndentLevel(indent_level + 2); } GetSubclassDescription(s, level); } @@ -4003,14 +3999,13 @@ void Target::StopHookCommandLine::GetSubclassDescription( s.PutCString(m_commands.GetStringAtIndex(0)); return; } - s.Indent("Commands: \n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + s.Indent("Commands:\n"); + auto indent_scope = s.MakeIndentScope(4); uint32_t num_commands = m_commands.GetSize(); for (uint32_t i = 0; i < num_commands; i++) { s.Indent(m_commands.GetStringAtIndex(i)); s.PutCString("\n"); } - s.SetIndentLevel(s.GetIndentLevel() - 4); } // Target::StopHookCommandLine @@ -4145,7 +4140,7 @@ void Target::StopHookScripted::GetSubclassDescription( return; s.Indent("Args:\n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + auto indent_scope = s.MakeIndentScope(4); auto print_one_element = [&s](llvm::StringRef key, StructuredData::Object *object) { @@ -4155,8 +4150,6 @@ void Target::StopHookScripted::GetSubclassDescription( }; as_dict->ForEach(print_one_element); - - s.SetIndentLevel(s.GetIndentLevel() - 4); } static constexpr OptionEnumValueElement g_dynamic_value_types[] = { diff --git a/lldb/source/Utility/RegisterValue.cpp b/lldb/source/Utility/RegisterValue.cpp index 12c349a143c0f..8b2af4e3d4f0e 100644 --- a/lldb/source/Utility/RegisterValue.cpp +++ b/lldb/source/Utility/RegisterValue.cpp @@ -206,7 +206,7 @@ Status RegisterValue::SetValueFromData(const RegisterInfo ®_info, int128.x[0] = data2; int128.x[1] = data1; } - SetUInt128(llvm::APInt(128, 2, int128.x)); + SetUInt128(llvm::APInt(128, int128.x)); } break; case eEncodingIEEE754: @@ -596,8 +596,10 @@ llvm::APInt RegisterValue::GetAsUInt128(const llvm::APInt &fail_value, case 8: case 16: return llvm::APInt( - BITWIDTH_INT128, NUM_OF_WORDS_INT128, - (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x); + BITWIDTH_INT128, + llvm::ArrayRef( + (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x, + NUM_OF_WORDS_INT128)); } } break; } diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index 89dce9fb0e1f7..e9632c3e1fc1f 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -202,6 +202,14 @@ void Stream::IndentLess(unsigned amount) { m_indent_level = 0; } +// Create an indentation scope that restores the original indent level when the +// object goes out of scope (RAII). 
+Stream::IndentScope Stream::MakeIndentScope(unsigned indent_amount) { + IndentScope indent_scope(*this); + IndentMore(indent_amount); + return indent_scope; +} + // Get the address size in bytes uint32_t Stream::GetAddressByteSize() const { return m_addr_size; } diff --git a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py index 50efecbc88c36..bed129a7a7a8c 100644 --- a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py +++ b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py @@ -15,7 +15,7 @@ class TestWeakSymbolsInExpressions(TestBase): NO_DEBUG_INFO_TESTCASE = True @skipUnlessDarwin - @skipIf(compiler="clang", compiler_version=["<", "7.0"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_weak_symbol_in_expr(self): """Tests that we can refer to weak symbols in expressions.""" self.build() diff --git a/lldb/test/API/commands/frame/select-hidden/Makefile b/lldb/test/API/commands/frame/select-hidden/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py new file mode 100644 index 0000000000000..698447b552877 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py @@ -0,0 +1,32 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class NavigateHiddenFrameTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test(self): + """Test going up/down a backtrace but we started in a hidden frame.""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec("main.cpp") + ) + # up + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("non_impl", thread.selected_frame.GetFunctionName()) + + # Back down again. 
+ self.expect("down") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) diff --git a/lldb/test/API/commands/frame/select-hidden/main.cpp b/lldb/test/API/commands/frame/select-hidden/main.cpp new file mode 100644 index 0000000000000..dc97abb6323a4 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/main.cpp @@ -0,0 +1,13 @@ +namespace std { +namespace __1 { +static const char *__impl2() { return "Break here"; } +static const char *__impl1() { return __impl2(); } +static const char *__impl() { return __impl1(); } +static const char *non_impl() { return __impl(); } +} // namespace __1 +} // namespace std + +int main() { + std::__1::non_impl(); + __builtin_debugtrap(); +} diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index 2f942da604ff2..eeb5d1b554b01 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 @@ class LibCxxInternalsRecognizerTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True @add_test_categories(["libc++"]) - @skipIf(compiler="clang", compiler_version=["<=", "19.0"]) + @skipIf(compiler="clang", compiler_version=["<", "21.0"]) def test_frame_recognizer(self): """Test that implementation details of libc++ are hidden""" self.build() diff --git a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py index 142d27ddad37f..f3558f62d51f8 100644 --- a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py +++ b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py @@ -16,6 +16,7 @@ def setUp(self): self.line = line_number("main.m", "// Set breakpoint 0 here.") @skipIf(macos_version=["<", "10.12"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_expr(self): self.build() exe = self.getBuildArtifact("a.out") diff --git a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py index 3be064ae7d5f8..657a7103ee989 100644 --- a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py +++ b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py @@ -6,6 +6,7 @@ class TestCase(TestBase): @no_debug_info_test + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_conflicting_properties(self): """Tests receiving two properties with the same name from modules.""" self.build() diff --git a/lldb/test/API/test_utils/pdb/Makefile b/lldb/test/API/test_utils/pdb/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/test_utils/pdb/TestPdb.py b/lldb/test/API/test_utils/pdb/TestPdb.py new file mode 100644 index 0000000000000..bd3a9d0c34ab3 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/TestPdb.py @@ -0,0 +1,18 @@ +""" +Test PDB enabled tests +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestBuildMethod(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + 
+ def test(self): + self.build() + self.assertTrue(self.dbg.CreateTarget(self.getBuildArtifact())) + if self.getDebugInfo() == "pdb": + self.expect( + "target modules dump symfile", patterns=["SymbolFile (native-)?pdb"] + ) diff --git a/lldb/test/API/test_utils/pdb/main.cpp b/lldb/test/API/test_utils/pdb/main.cpp new file mode 100644 index 0000000000000..76e8197013aab --- /dev/null +++ b/lldb/test/API/test_utils/pdb/main.cpp @@ -0,0 +1 @@ +int main() { return 0; } diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py index 09e3f62f0eead..19f88d88c2ff4 100644 --- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py +++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py @@ -3,17 +3,15 @@ """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbdap_testcase -import subprocess import time import os -class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase): +class TestDAP_disconnect(lldbdap_testcase.DAPTestCaseBase): source = "main.cpp" def disconnect_and_assert_no_output_printed(self): @@ -67,10 +65,11 @@ def test_attach(self): lambda: self.run_platform_command("rm %s" % (sync_file_path)) ) - self.process = subprocess.Popen([program, sync_file_path]) + proc = self.spawnSubprocess(program, [sync_file_path]) lldbutil.wait_for_file_on_target(self, sync_file_path) - self.attach(pid=self.process.pid, disconnectAutomatically=False) + self.attach(pid=proc.pid, disconnectAutomatically=False, stopOnEntry=True) + self.continue_to_next_stop() response = self.dap_server.request_evaluate("wait_for_attach = false;") self.assertTrue(response["success"]) diff --git a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py index 12b321cf42778..3c53cf2ed3460 100644 --- a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py +++ b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py @@ -37,7 +37,7 @@ def cleanup(): def run_debug_session(self, connection, name, sleep_seconds_in_middle=None): self.dap_server = dap_server.DebugAdapterServer( - connection=connection, + connection=connection, spawn_helper=self.spawnSubprocess ) program = self.getBuildArtifact("a.out") source = "main.c" @@ -94,6 +94,7 @@ def test_server_interrupt(self): (process, connection) = self.start_server(connection="listen://localhost:0") self.dap_server = dap_server.DebugAdapterServer( connection=connection, + spawn_helper=self.spawnSubprocess, ) program = self.getBuildArtifact("a.out") source = "main.c" diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 513d1ec493ee1..818dff58aceeb 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -202,7 +202,7 @@ if(TARGET clang) else() # We require libcxx for the test suite, so if we aren't building it, # provide a helpful error about how to resolve the situation. - if(NOT LLDB_HAS_LIBCXX) + if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS AND NOT LLDB_HAS_LIBCXX) message(SEND_ERROR "LLDB test suite requires libc++, but it is currently disabled. 
" "Please add `libcxx` to `LLVM_ENABLE_RUNTIMES` or disable tests via " diff --git a/lldb/test/Shell/DAP/TestClientLauncher.test b/lldb/test/Shell/DAP/TestClientLauncher.test new file mode 100644 index 0000000000000..a79a940da5a98 --- /dev/null +++ b/lldb/test/Shell/DAP/TestClientLauncher.test @@ -0,0 +1,2 @@ +# RUN: lldb-dap --client vscode-url -- /path/to/foo | FileCheck %s +# CHECK: vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Ffoo diff --git a/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test new file mode 100644 index 0000000000000..a9557801cc134 --- /dev/null +++ b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test @@ -0,0 +1,36 @@ +# Test format (e.g., indentation) when printing the list of stop hooks. +# +# RUN: %lldb -b -s %s | FileCheck %s --match-full-lines --strict-whitespace + +# Create some stop hooks +target stop-hook add -o 'print "Hello"' --auto-continue true --at-initial-stop true +target stop-hook add -o 'print "world,"' -o 'print "nice"' --file 'my_file' +target stop-hook add -o 'print "weather!"' --classname 'MyClass' --thread-name 'my_thread' + +# Print hooks +target stop-hook list + +# CHECK:(lldb) target stop-hook list +# CHECK:Hook: 1 +# CHECK: State: enabled +# CHECK: AutoContinue on +# CHECK: Commands: +# CHECK: print "Hello" +# CHECK-EMPTY: +# CHECK:Hook: 2 +# CHECK: State: enabled +# CHECK: Specifier: +# CHECK: File: my_file. +# CHECK: Commands: +# CHECK: print "world," +# CHECK: print "nice" +# CHECK-EMPTY: +# CHECK:Hook: 3 +# CHECK: State: enabled +# CHECK: Specifier: +# CHECK: Class name: MyClass. +# CHECK: Thread: +# CHECK: thread name: "my_thread" +# CHECK: Commands: +# CHECK: print "weather!" +# CHECK-EMPTY: diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m new file mode 100644 index 0000000000000..83a829a8c2fdd --- /dev/null +++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m @@ -0,0 +1,4 @@ +int main() { + __builtin_verbose_trap("Foo", "Bar"); + return 0; +} diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test new file mode 100644 index 0000000000000..34400d9a27575 --- /dev/null +++ b/lldb/test/Shell/Recognizer/registration-unique.test @@ -0,0 +1,56 @@ +# UNSUPPORTED: system-windows + +# Checks that the recognizers that should work across language runtimes +# are only registered once with the target. + +# RUN: split-file %s %t + +# RUN: %clang_host %t/main.cpp -g -o %t/cpp.out +# RUN: %lldb -b -s %t/commands.input %t/cpp.out | FileCheck %s + +# RUN: %clang_host -x objective-c++ %t/main.mm -g -o %t/objcxx.out +# RUN: %lldb -b -s %t/commands.input %t/objcxx.out | FileCheck %s + +# RUN: %clang_host %t/main.c -g -o %t/c.out +# RUN: %lldb -b -s %t/commands.input %t/c.out | FileCheck %s + +# RUN: %clang_host -x objective-c %t/main.m -g -o %t/objc.out +# RUN: %lldb -b -s %t/commands.input %t/objc.out | FileCheck %s + +#--- main.m +int main() {} + +#--- main.c +int main() {} + +#--- main.mm +int main() {} + +#--- main.cpp +int main() {} + +#--- commands.input + +b main +frame recognizer list +run +frame recognizer list +continue +run +frame recognizer list + +# CHECK: frame recognizer list +# CHECK-NEXT: no matching results found. 
+ +# CHECK: frame recognizer list +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-NOT: Verbose Trap StackFrame Recognizer +# CHECK-NOT: Assert StackFrame Recognizer + +# FIXME: avoid duplicate frame recognizers in the target: https://github.com/llvm/llvm-project/issues/166341 +# CHECK: frame recognizer list +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer diff --git a/lldb/test/Shell/Recognizer/verbose_trap-objc.test b/lldb/test/Shell/Recognizer/verbose_trap-objc.test new file mode 100644 index 0000000000000..789e79c9542c5 --- /dev/null +++ b/lldb/test/Shell/Recognizer/verbose_trap-objc.test @@ -0,0 +1,12 @@ +# REQUIRES: system-darwin +# +# RUN: %clangxx_host -x objective-c -g %S/Inputs/verbose_trap.m -o %t.out +# RUN: %lldb -b -s %s %t.out | FileCheck %s + +run +# CHECK: thread #{{.*}}stop reason = Foo: Bar +frame info +# CHECK: frame #{{.*}}`main at verbose_trap.m +frame recognizer info 0 +# CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer +q diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt index 67956af7fe3fb..efe51506f3545 100644 --- a/lldb/tools/driver/CMakeLists.txt +++ b/lldb/tools/driver/CMakeLists.txt @@ -37,6 +37,9 @@ add_dependencies(lldb if(DEFINED LLDB_PYTHON_DLL_RELATIVE_PATH) target_compile_definitions(lldb PRIVATE LLDB_PYTHON_DLL_RELATIVE_PATH="${LLDB_PYTHON_DLL_RELATIVE_PATH}") endif() +if(DEFINED LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME) + target_compile_definitions(lldb PRIVATE LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME="${LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME}") +endif() if(LLDB_BUILD_FRAMEWORK) # In the build-tree, we know the exact path to the framework directory. diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index 733331f4ddac0..bebf1a70d50e9 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -433,7 +433,8 @@ SBError Driver::ProcessArgs(const opt::InputArgList &args, bool &exiting) { return error; } -#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH) +#ifdef _WIN32 +#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH /// Returns the full path to the lldb.exe executable. inline std::wstring GetPathToExecutableW() { // Iterate until we reach the Windows API maximum path length (32,767). @@ -447,30 +448,73 @@ inline std::wstring GetPathToExecutableW() { return L""; } -/// Resolve the full path of the directory defined by +/// \brief Resolve the full path of the directory defined by /// LLDB_PYTHON_DLL_RELATIVE_PATH. If it exists, add it to the list of DLL /// search directories. -void AddPythonDLLToSearchPath() { +/// \return `true` if the library was added to the search path. +/// `false` otherwise. +bool AddPythonDLLToSearchPath() { std::wstring modulePath = GetPathToExecutableW(); - if (modulePath.empty()) { - llvm::errs() << "error: unable to find python.dll." 
<< '\n';
-    return;
-  }
+  if (modulePath.empty())
+    return false;
 
   SmallVector<char, MAX_PATH> utf8Path;
   if (sys::windows::UTF16ToUTF8(modulePath.c_str(), modulePath.length(),
                                 utf8Path))
-    return;
+    return false;
 
   sys::path::remove_filename(utf8Path);
   sys::path::append(utf8Path, LLDB_PYTHON_DLL_RELATIVE_PATH);
   sys::fs::make_absolute(utf8Path);
 
   SmallVector<wchar_t, 1> widePath;
   if (sys::windows::widenPath(utf8Path.data(), widePath))
-    return;
+    return false;
 
   if (sys::fs::exists(utf8Path))
-    SetDllDirectoryW(widePath.data());
+    return SetDllDirectoryW(widePath.data());
+
+  return false;
+}
+#endif
+
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+/// Returns whether `python3xx.dll` is in the DLL search path.
+bool IsPythonDLLInPath() {
+#define WIDEN2(x) L##x
+#define WIDEN(x) WIDEN2(x)
+  WCHAR foundPath[MAX_PATH];
+  DWORD result =
+      SearchPathW(nullptr, WIDEN(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME), nullptr,
+                  MAX_PATH, foundPath, nullptr);
+#undef WIDEN2
+#undef WIDEN
+
+  return result > 0;
+}
+#endif
+
+/// Try to set up the DLL search path for the Python Runtime Library
+/// (python3xx.dll).
+///
+/// If `LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME` is set, we first check if
+/// python3xx.dll is in the search path. If it's not, we try to add it and
+/// check for it a second time.
+/// If only `LLDB_PYTHON_DLL_RELATIVE_PATH` is set, we try to add python3xx.dll
+/// to the search path whether it is already in the search path or not.
+void SetupPythonRuntimeLibrary() {
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+  if (IsPythonDLLInPath())
+    return;
+#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH
+  if (AddPythonDLLToSearchPath() && IsPythonDLLInPath())
+    return;
+#endif
+  llvm::errs() << "error: unable to find '"
+               << LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME << "'.\n";
+  return;
+#elif defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+  if (!AddPythonDLLToSearchPath())
+    llvm::errs() << "error: unable to find the Python runtime library.\n";
+#endif
 }
 #endif
 
@@ -776,8 +820,8 @@ int main(int argc, char const *argv[]) {
           "~/Library/Logs/DiagnosticReports/.\n");
 #endif
 
-#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
-  AddPythonDLLToSearchPath();
+#ifdef _WIN32
+  SetupPythonRuntimeLibrary();
+#endif
 
   // Parse arguments.
diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt
index dd1bbbdddfc59..fa940b7b73943 100644
--- a/lldb/tools/lldb-dap/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS Support)
 add_lldb_library(lldbDAP
   Breakpoint.cpp
   BreakpointBase.cpp
+  ClientLauncher.cpp
   CommandPlugins.cpp
   DAP.cpp
   DAPError.cpp
diff --git a/lldb/tools/lldb-dap/ClientLauncher.cpp b/lldb/tools/lldb-dap/ClientLauncher.cpp
new file mode 100644
index 0000000000000..4cac1d6346441
--- /dev/null
+++ b/lldb/tools/lldb-dap/ClientLauncher.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ClientLauncher.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace lldb_dap;
+
+std::optional<ClientLauncher::Client>
+ClientLauncher::GetClientFrom(llvm::StringRef str) {
+  return llvm::StringSwitch<std::optional<ClientLauncher::Client>>(str.lower())
+      .Case("vscode", ClientLauncher::VSCode)
+      .Case("vscode-url", ClientLauncher::VSCodeURL)
+      .Default(std::nullopt);
+}
+
+std::unique_ptr<ClientLauncher>
+ClientLauncher::GetLauncher(ClientLauncher::Client client) {
+  switch (client) {
+  case ClientLauncher::VSCode:
+    return std::make_unique<VSCodeLauncher>();
+  case ClientLauncher::VSCodeURL:
+    return std::make_unique<VSCodeURLPrinter>();
+  }
+  return nullptr;
+}
+
+std::string VSCodeLauncher::URLEncode(llvm::StringRef str) {
+  std::string out;
+  llvm::raw_string_ostream os(out);
+  for (unsigned char c : str) {
+    if (std::isalnum(c) || llvm::StringRef("-_.~").contains(c))
+      os << c;
+    else
+      os << '%' << llvm::utohexstr(c, false, 2);
+  }
+  return os.str();
+}
+
+std::string
+VSCodeLauncher::GetLaunchURL(const std::vector<llvm::StringRef> args) const {
+  assert(!args.empty() && "empty launch args");
+
+  std::vector<std::string> encoded_launch_args;
+  for (llvm::StringRef arg : args)
+    encoded_launch_args.push_back(URLEncode(arg));
+
+  const std::string args_str = llvm::join(encoded_launch_args, "&args=");
+  return llvm::formatv(
+             "vscode://llvm-vs-code-extensions.lldb-dap/start?program={0}",
+             args_str)
+      .str();
+}
+
+llvm::Error VSCodeLauncher::Launch(const std::vector<llvm::StringRef> args) {
+  const std::string launch_url = GetLaunchURL(args);
+  const std::string command =
+      llvm::formatv("code --open-url {0}", launch_url).str();
+
+  std::system(command.c_str());
+  return llvm::Error::success();
+}
+
+llvm::Error VSCodeURLPrinter::Launch(const std::vector<llvm::StringRef> args) {
+  llvm::outs() << GetLaunchURL(args) << '\n';
+  return llvm::Error::success();
+}
diff --git a/lldb/tools/lldb-dap/ClientLauncher.h b/lldb/tools/lldb-dap/ClientLauncher.h
new file mode 100644
index 0000000000000..780b178d2d6ef
--- /dev/null
+++ b/lldb/tools/lldb-dap/ClientLauncher.h
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
+#define LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <vector>
+
+namespace lldb_dap {
+
+class ClientLauncher {
+public:
+  enum Client {
+    VSCode,
+    VSCodeURL,
+  };
+
+  virtual ~ClientLauncher() = default;
+  virtual llvm::Error Launch(const std::vector<llvm::StringRef> args) = 0;
+
+  static std::optional<Client> GetClientFrom(llvm::StringRef str);
+  static std::unique_ptr<ClientLauncher> GetLauncher(Client client);
+};
+
+class VSCodeLauncher : public ClientLauncher {
+public:
+  using ClientLauncher::ClientLauncher;
+
+  llvm::Error Launch(const std::vector<llvm::StringRef> args) override;
+
+  std::string GetLaunchURL(const std::vector<llvm::StringRef> args) const;
+  static std::string URLEncode(llvm::StringRef str);
+};
+
+class VSCodeURLPrinter : public VSCodeLauncher {
+  using VSCodeLauncher::VSCodeLauncher;
+
+  llvm::Error Launch(const std::vector<llvm::StringRef> args) override;
+};
+
+} // namespace lldb_dap
+
+#endif
diff --git a/lldb/tools/lldb-dap/tool/Options.td b/lldb/tools/lldb-dap/tool/Options.td
index 5e9dd7a1d6419..339a64fed6c32 100644
--- a/lldb/tools/lldb-dap/tool/Options.td
+++ b/lldb/tools/lldb-dap/tool/Options.td
@@ -82,3 +82,11 @@ def connection_timeout: S<"connection-timeout">,
     "timeout is reached, the server will be closed and the process will exit. "
     "Not specifying this argument or specifying non-positive values will "
     "cause the server to wait for new connections indefinitely.">;
+
+def client
+    : S<"client">,
+      MetaVarName<"<client>">,
+      HelpText<
+          "Use lldb-dap as a launcher for a curated set of DAP clients.">;
+
+def REM : R<["--"], "">;
diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
index 45caa1a81059b..f10ed12344cbd 100644
--- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ClientLauncher.h"
 #include "DAP.h"
 #include "DAPLog.h"
 #include "EventHelper.h"
@@ -141,6 +142,12 @@ static void PrintHelp(LLDBDAPOptTable &table, llvm::StringRef tool_name) {
     debugger to attach to the process.
 
     lldb-dap -g
+
+  You can also use lldb-dap to launch a supported client, for example the
+  LLDB-DAP Visual Studio Code extension.
+ + lldb-dap --client vscode -- /path/to/binary <args> + )___"; } @@ -150,6 +157,29 @@ static void PrintVersion() { llvm::outs() << "liblldb: " << lldb::SBDebugger::GetVersionString() << '\n'; } +static llvm::Error LaunchClient(const llvm::opt::InputArgList &args) { + auto *client_arg = args.getLastArg(OPT_client); + assert(client_arg && "must have client arg"); + + std::optional<ClientLauncher::Client> client = + ClientLauncher::GetClientFrom(client_arg->getValue()); + if (!client) + return llvm::createStringError( + llvm::formatv("unsupported client: {0}", client_arg->getValue())); + + std::vector<llvm::StringRef> launch_args; + if (auto *arg = args.getLastArgNoClaim(OPT_REM)) { + for (auto *value : arg->getValues()) { + launch_args.push_back(value); + } + } + + if (launch_args.empty()) + return llvm::createStringError("no launch arguments provided"); + + return ClientLauncher::GetLauncher(*client)->Launch(launch_args); +} + #if not defined(_WIN32) struct FDGroup { int GetFlags() const { @@ -541,6 +571,14 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } + if (input_args.hasArg(OPT_client)) { + if (llvm::Error error = LaunchClient(input_args)) { + llvm::WithColor::error() << llvm::toString(std::move(error)) << '\n'; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; + } + ReplMode default_repl_mode = ReplMode::Auto; if (input_args.hasArg(OPT_repl_mode)) { llvm::opt::Arg *repl_mode = input_args.getLastArg(OPT_repl_mode); diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index a08414c30e6cd..b1fdef18fddba 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -1,4 +1,5 @@ add_lldb_unittest(DAPTests + ClientLauncherTest.cpp DAPErrorTest.cpp DAPTest.cpp DAPTypesTest.cpp diff --git a/lldb/unittests/DAP/ClientLauncherTest.cpp b/lldb/unittests/DAP/ClientLauncherTest.cpp new file mode 100644 index 0000000000000..dbaf9ee786336 --- /dev/null +++ b/lldb/unittests/DAP/ClientLauncherTest.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ClientLauncher.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" +#include <optional> + +using namespace lldb_dap; +using namespace llvm; + +TEST(ClientLauncherTest, GetClientFromVSCode) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("vscode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeUpperCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCODE"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeMixedCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromInvalidString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("invalid"); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, GetClientFromEmptyString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom(""); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, URLEncode) { + EXPECT_EQ("", VSCodeLauncher::URLEncode("")); + EXPECT_EQ( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.~", + VSCodeLauncher::URLEncode("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRST" + "UVWXYZ0123456789-_.~")); + EXPECT_EQ("hello%20world", VSCodeLauncher::URLEncode("hello world")); + EXPECT_EQ("hello%21%40%23%24", VSCodeLauncher::URLEncode("hello!@#$")); + EXPECT_EQ("%2Fpath%2Fto%2Ffile", VSCodeLauncher::URLEncode("/path/to/file")); + EXPECT_EQ("key%3Dvalue%26key2%3Dvalue2", + VSCodeLauncher::URLEncode("key=value&key2=value2")); + EXPECT_EQ("100%25complete", VSCodeLauncher::URLEncode("100%complete")); + EXPECT_EQ("file_name%20with%20spaces%20%26%20special%21.txt", + VSCodeLauncher::URLEncode("file_name with spaces & special!.txt")); + EXPECT_EQ("%00%01%02", + VSCodeLauncher::URLEncode(llvm::StringRef("\x00\x01\x02", 3))); + EXPECT_EQ("test-file_name.txt~", + VSCodeLauncher::URLEncode("test-file_name.txt~")); + + // UTF-8 encoded characters should be percent-encoded byte by byte. + EXPECT_EQ("%C3%A9", VSCodeLauncher::URLEncode("é")); +} diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c450ee5a3d72e..f192cd05b5a34 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1264,10 +1264,10 @@ endif() # Build with _XOPEN_SOURCE on z/OS. if (CMAKE_SYSTEM_NAME MATCHES "OS390") add_compile_definitions(_XOPEN_SOURCE=600) + add_compile_definitions(_XPLATFORM_SOURCE) # Needed e.g. for O_CLOEXEC. add_compile_definitions(_OPEN_SYS) # Needed for process information. add_compile_definitions(_OPEN_SYS_FILE_EXT) # Needed for EBCDIC I/O. add_compile_definitions(_EXT) # Needed for file data. - add_compile_definitions(_UNIX03_THREADS) # Multithreading support. # Need to build LLVM as ASCII application. # This can't be a global setting because other projects may # need to be built in EBCDIC mode. 
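For readers skimming the lldb-dap changes above, here is a small, self-contained sketch (not part of the patch) of the URL scheme the new VS Code launcher produces: alphanumerics and `-_.~` pass through unchanged, every other byte becomes a `%XX` escape, and the encoded arguments are appended to the `vscode://llvm-vs-code-extensions.lldb-dap/start?program=` prefix. The helper names below are made up for illustration; only the output format is taken from the patch and its tests.

```cpp
// Standalone illustration of the launcher's percent-encoding, using only the
// C++ standard library so it can be compiled and run on its own.
#include <cstdio>
#include <string>
#include <vector>

// Percent-encode a string byte by byte, keeping unreserved characters.
static std::string urlEncode(const std::string &S) {
  static const char *Hex = "0123456789ABCDEF";
  std::string Out;
  for (unsigned char C : S) {
    bool Unreserved = (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
                      (C >= '0' && C <= '9') || C == '-' || C == '_' ||
                      C == '.' || C == '~';
    if (Unreserved) {
      Out += static_cast<char>(C);
    } else {
      Out += '%';
      Out += Hex[C >> 4];
      Out += Hex[C & 0xF];
    }
  }
  return Out;
}

int main() {
  // The first argument is the program; the rest become "&args=" query entries.
  std::vector<std::string> Args = {"/path/to/foo"};
  std::string Query;
  for (size_t I = 0; I < Args.size(); ++I)
    Query += (I == 0 ? "" : "&args=") + urlEncode(Args[I]);
  // Prints the same URL that TestClientLauncher.test checks for:
  // vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Ffoo
  std::printf("vscode://llvm-vs-code-extensions.lldb-dap/start?program=%s\n",
              Query.c_str());
  return 0;
}
```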
diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index bfbd9cfd4063f..2a69c5133c56f 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -101,6 +101,7 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) -DLLVM_INCLUDE_BENCHMARKS=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_TABLEGEN_FLAGS="${LLVM_TABLEGEN_FLAGS}" + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" ${build_type_flags} ${linker_flag} ${external_clang_dir} ${libc_flags} ${ARGN} WORKING_DIRECTORY ${${project_name}_${target_name}_BUILD} diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 7780c0a6dca0a..30b22a4a6d607 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -883,8 +883,9 @@ supported for the ``amdgcn`` target. Buffer Fat Pointer 7 N/A N/A 160 0 Buffer Resource 8 N/A V# 128 0x00000000000000000000000000000000 Buffer Strided Pointer (experimental) 9 *TODO* - *reserved for downstream use* 10 - *reserved for downstream use* 11 + *reserved for future use* 10 + *reserved for future use* 11 + *reserved for downstream use (LLPC)* 12 Streamout Registers 128 N/A GS_REGS ===================================== =============== =========== ================ ======= ============================ diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 8e61e01d7d9c3..0e442d657e987 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -70,6 +70,14 @@ OPTIONS Print this help output. +.. option:: --include-swiftmodules-from-interface + + Whether or not to copy binary swiftmodules built from textual .swiftinterface + files into the dSYM bundle. These typically come only from the SDK (since + textual interfaces require library evolution) and thus are a waste of space to + copy into the bundle. Turn this on if the swiftmodules are different from + those in the SDK. + .. option:: --keep-function-for-static Make a static variable keep the enclosing function even if it would have been diff --git a/llvm/docs/CommandGuide/llvm-dwarfdump.rst b/llvm/docs/CommandGuide/llvm-dwarfdump.rst index 137830259eb64..dfc0431f07826 100644 --- a/llvm/docs/CommandGuide/llvm-dwarfdump.rst +++ b/llvm/docs/CommandGuide/llvm-dwarfdump.rst @@ -134,6 +134,15 @@ OPTIONS Abbreviate the description of type unit entries. +.. option:: -t, --filter-child-tag + + Only dump children whose DWARF tag is one of the specified tags. + Example usage: + + .. code-block:: c + + llvm-dwarfdump -t DW_TAG_structure_type -t DW_TAG_member -c + .. option:: -x, --regex Treat any <name> strings as regular expressions when searching diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3c089b5a0ba79..b9507a2d054fe 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2741,6 +2741,11 @@ For example: ``"nooutline"`` This attribute indicates that outlining passes should not modify the function. +``nocreateundeforpoison`` + This attribute indicates that the result of the function (prior to + application of return attributes/metadata) will not be undef or poison if + all arguments are not undef and not poison. Otherwise, it is undefined + behavior. 
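As a rough illustration of how an analysis could consume the new `nocreateundeforpoison` guarantee documented above, the sketch below combines it with `isGuaranteedNotToBeUndefOrPoison` from ValueTracking. This is not code from the patch; in particular, the `Attribute::NoCreateUndefOrPoison` enumerator name is an assumption (the attribute's C++ spelling is defined in Attributes.td, which is not part of this hunk).

```cpp
// Hedged sketch: if the callee promises not to introduce undef/poison and
// every argument is known to be well defined, the call's result is too.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

static bool callResultIsWellDefined(const CallBase &CB) {
  // Assumed enumerator name; see Attributes.td for the real definition.
  if (!CB.hasFnAttr(Attribute::NoCreateUndefOrPoison))
    return false;
  for (const Use &ArgUse : CB.args())
    if (!isGuaranteedNotToBeUndefOrPoison(ArgUse.get()))
      return false;
  return true;
}
```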
Call Site Attributes ---------------------- diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index 3f4c3cde9b3aa..f7647c898c1e6 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -86,25 +86,25 @@ Tests are more accessible and future proof when simplified: - Use the ``-simplify-mir`` option with llc. - Machine function attributes often have default values or the test works just - as well with default values. Typical candidates for this are: `alignment:`, - `exposesReturnsTwice`, `legalized`, `regBankSelected`, `selected`. + as well with default values. Typical candidates for this are: ``alignment:``, + ``exposesReturnsTwice``, ``legalized``, ``regBankSelected``, ``selected``. The whole `frameInfo` section is often unnecessary if there is no special - frame usage in the function. `tracksRegLiveness` on the other hand is often + frame usage in the function. ``tracksRegLiveness`` on the other hand is often necessary for some passes that care about block livein lists. -- The (global) `liveins:` list is typically only interesting for early +- The (global) ``liveins:`` list is typically only interesting for early instruction selection passes and can be removed when testing later passes. - The per-block `liveins:` on the other hand are necessary if + The per-block ``liveins:`` on the other hand are necessary if `tracksRegLiveness` is true. -- Branch probability data in block `successors:` lists can be dropped if the +- Branch probability data in block ``successors:`` lists can be dropped if the test doesn't depend on it. Example: - `successors: %bb.1(0x40000000), %bb.2(0x40000000)` can be replaced with - `successors: %bb.1, %bb.2`. + ``successors: %bb.1(0x40000000), %bb.2(0x40000000)`` can be replaced with + ``successors: %bb.1, %bb.2``. - MIR code contains a whole IR module. This is necessary because there are no equivalents in MIR for global variables, references to external functions, - function attributes, metadata, debug info. Instead some MIR data references + function attributes, metadata, debug info. Instead, some MIR data references the IR constructs. You can often remove them if the test doesn't depend on them. @@ -114,16 +114,16 @@ Tests are more accessible and future proof when simplified: dropped: `:: (load 8)` - MIR blocks can reference IR blocks for debug printing, profile information, - or debug locations. Example: `bb.42.myblock` in MIR references the IR block - `myblock`. It is usually possible to drop the `.myblock` reference and simply - use `bb.42`. + or debug locations. Example: ``bb.42.myblock`` in MIR references the IR block + ``myblock``. It is usually possible to drop the ``.myblock`` reference and simply + use ``bb.42``. - If there are no memory operands or blocks referencing the IR, then the IR function can be replaced by a parameterless dummy function like - `define @func() { ret void }`. + ``define @func() { ret void }``. - It is possible to drop the whole IR section of the MIR file if it only - contains dummy functions (see above). The .mir loader will create the + contains dummy functions (see above). The ``.mir`` loader will create the IR functions automatically in this case. .. 
_limitations: @@ -131,7 +131,7 @@ Tests are more accessible and future proof when simplified: Limitations ----------- -Currently the MIR format has several limitations in terms of which state it +Currently, the MIR format has several limitations in terms of which state it can serialize: - The target-specific state in the target-specific ``MachineFunctionInfo`` @@ -150,7 +150,7 @@ These limitations impose restrictions on what you can test with the MIR format. For now, tests that would like to test some behaviour that depends on the state of temporary or local ``MCSymbol`` operands or the exception handling state in MMI, can't use the MIR format. As well as that, tests that test some behaviour -that depends on the state of the target specific ``MachineFunctionInfo`` or +that depends on the state of the target-specific ``MachineFunctionInfo`` or ``MachineConstantPoolValue`` subclasses can't use the MIR format at the moment. High Level Structure @@ -286,7 +286,7 @@ Example: Successors ^^^^^^^^^^ -The machine basic block's successors have to be specified before any of the +The machine basic block's successors must be specified before any of the instructions: .. code-block:: text @@ -489,13 +489,13 @@ In case this is true, the Machine Operand is printed according to the target. For example: -In AArch64RegisterInfo.td: +In ``AArch64RegisterInfo.td``: .. code-block:: text def sub_32 : SubRegIndex<32>; -If the third operand is an immediate with the value ``15`` (target-dependent +If the third operand is an immediate with the value ``15`` (a target-dependent value), based on the instruction's opcode and the operand's index the operand will be printed as ``%subreg.sub_32``: @@ -503,7 +503,7 @@ will be printed as ``%subreg.sub_32``: %1:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 -For integers > 64 bits, we use a special machine operand, ``MO_CImmediate``, +For integers larger than 64 bits, we use a special machine operand, ``MO_CImmediate``, which stores the immediate in a ``ConstantInt`` using an ``APInt`` (LLVM's arbitrary-precision integers). @@ -552,7 +552,7 @@ corresponding internal ``llvm::RegState`` representation: * - ``implicit`` - ``RegState::Implicit`` - - Not emitted register (e.g. carry, or temporary result). + - Not emitted register (e.g., carry, or temporary result). * - ``implicit-def`` - ``RegState::ImplicitDefine`` @@ -625,7 +625,7 @@ For a CPI with the index 0 and offset -12: %1:gr64 = MOV64ri %const.0 - 12 -A constant pool entry is bound to a LLVM IR ``Constant`` or a target-specific +A constant pool entry is bound to an LLVM IR ``Constant`` or a target-specific ``MachineConstantPoolValue``. When serializing all the function's constants, the following format is used: @@ -670,12 +670,12 @@ a global value operand named ``G``: $rax = MOV64rm $rip, 1, _, @G, _ -The named global values are represented using an identifier with the '@' prefix. +The named global values are represented using an identifier with the ``@`` prefix. If the identifier doesn't match the regular expression -`[-a-zA-Z$._][-a-zA-Z$._0-9]*`, then this identifier must be quoted. +``[-a-zA-Z$._][-a-zA-Z$._0-9]*``, then this identifier must be quoted. The unnamed global values are represented using an unsigned numeric value with -the '@' prefix, like in the following examples: ``@0``, ``@989``. +the ``@`` prefix, as in the following examples: ``@0``, ``@989``. 
Target-dependent Index Operands ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -741,7 +741,7 @@ Example: MCSymbol Operands ^^^^^^^^^^^^^^^^^ -A MCSymbol operand holds a pointer to a ``MCSymbol``. For the limitations +An ``MCSymbol`` operand holds a pointer to an ``MCSymbol``. For the limitations of this operand in MIR, see :ref:`limitations <limitations>`. The syntax is: @@ -825,7 +825,7 @@ Comments ^^^^^^^^ Machine operands can have C/C++ style comments, which are annotations enclosed -between ``/*`` and ``*/`` to improve readability of e.g. immediate operands. +between ``/*`` and ``*/`` to improve readability of e.g., immediate operands. In the example below, ARM instructions EOR and BCC and immediate operands ``14`` and ``0`` have been annotated with their condition codes (CC) definitions, i.e. the ``always`` and ``eq`` condition codes: @@ -920,7 +920,7 @@ Instruction referencing locations This experimental feature aims to separate the specification of variable *values* from the program point where a variable takes on that value. Changes -in variable value occur in the same manner as ``DBG_VALUE`` meta instructions +in a variable value occur in the same manner as ``DBG_VALUE`` meta instructions but using ``DBG_INSTR_REF``. Variable values are identified by a pair of instruction number and operand number. Consider the example below: diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst index d43b9c3a89091..d64c846687bae 100644 --- a/llvm/docs/MergeFunctions.rst +++ b/llvm/docs/MergeFunctions.rst @@ -8,9 +8,9 @@ MergeFunctions pass, how it works Introduction ============ Sometimes code contains equal functions, or functions that do exactly the same -thing even though they are non-equal on the IR level (e.g.: multiplication on 2 -and 'shl 1'). This can happen for several reasons: mainly, the usage of -templates and automatic code generators. Though, sometimes the user itself could +thing even though they are non-equal on the IR level (e.g.,: multiplication on 2 +and ``shl 1``). This can happen for several reasons: mainly, the usage of +templates and automatic code generators. However, sometimes the user itself could write the same thing twice :-) The main purpose of this pass is to recognize such functions and merge them. @@ -20,21 +20,21 @@ describes the algorithm used to compare functions and explains how we could combine equal functions correctly to keep the module valid. -Material is brought in a top-down form, so the reader could start to learn pass +The material is presented in a top-down form, so the reader could start to learn pass from high level ideas and end with low-level algorithm details, thus preparing him or her for reading the sources. The main goal is to describe the algorithm and logic here and the concept. If you *don't want* to read the source code, but want to understand pass algorithms, this document is good for you. The author tries not to repeat the -source-code and covers only common cases to avoid the cases of needing to +source code and covers only common cases to avoid the cases of needing to update this document after any minor code changes. What should I know to be able to follow along with this document? ----------------------------------------------------------------- -The reader should be familiar with common compile-engineering principles and +The reader should be familiar with common compiler-engineering principles and LLVM code fundamentals. 
In this article, we assume the reader is familiar with `Single Static Assignment <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_ @@ -99,7 +99,7 @@ and a ``void*`` as equal. This is just an example; more possible details are described a bit below. As another example, the reader may imagine two more functions. The first -function performs a multiplication by 2, while the second one performs an +function performs a multiplication by 2, while the second one performs a logical left shift by 1. Possible solutions @@ -131,7 +131,7 @@ access lookup? The answer is: "yes". Random-access """"""""""""" How can this be done? Just convert each function to a number, and gather -all of them in a special hash-table. Functions with equal hashes are equal. +all of them in a special hash table. Functions with equal hashes are equal. Good hashing means, that every function part must be taken into account. That means we have to convert every function part into some number, and then add it into the hash. The lookup-up time would be small, but such an approach adds some @@ -175,7 +175,7 @@ merged with each other. It is defined as: ``std::set<FunctionNode> FnTree;`` -Here ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with +Here, ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with an implemented “<” operator among the functions set (below we explain how it works exactly; this is a key point in fast functions comparison). @@ -207,7 +207,7 @@ from method. Comparison and logarithmical search """"""""""""""""""""""""""""""""""" Let's recall our task: for every function *F* from module *M*, we have to find -equal functions *F`* in the shortest time possible , and merge them into a +equal functions *F`* in the shortest time possible and merge them into a single function. Defining total ordering among the functions set allows us to organize @@ -225,7 +225,7 @@ possible values: 1, left is *greater* than right. -Of course it means, that we have to maintain +Of course, it means that we have to maintain *strict and non-strict order relation properties*: * reflexivity (``a <= a``, ``a == a``, ``a >= a``), @@ -235,7 +235,7 @@ Of course it means, that we have to maintain As mentioned before, the comparison routine consists of "sub-comparison-routines", with each of them also consisting of -"sub-comparison-routines", and so on. Finally, it ends up with primitive +"sub-comparison-routines", and so on. Finally, it ends up with a primitive comparison. Below, we will use the following operations: @@ -275,7 +275,7 @@ A brief look at the source code tells us that the comparison starts in the “``int FunctionComparator::compare(void)``” method. 1. The first parts to be compared are the function's attributes and some -properties that is outside the “attributes” term, but still could make the +properties that are outside the “attributes” term, but still could make the function different without changing its body. This part of the comparison is usually done within simple *cmpNumbers* or *cmpFlags* operations (e.g. ``cmpFlags(F1->hasGC(), F2->hasGC())``). Below is a full list of function's @@ -365,7 +365,7 @@ comparing them as numbers. 7. Complex types (structures, arrays, etc.). Follow complex objects comparison technique (see the very first paragraph of this chapter). Both *left* and *right* are to be expanded and their element types will be checked the same -way. If we get -1 or 1 on some stage, return it. Otherwise return 0. +way. If we get -1 or 1 on some stage, return it. Otherwise, return 0. 
8. Steps 1-6 describe all the possible cases, if we passed steps 1-6 and didn't get any conclusions, then invoke ``llvm_unreachable``, since it's quite an @@ -445,7 +445,7 @@ How to implement cmpValues? but, in general, we need to implement antisymmetric relation. As mentioned above, to understand what is *less*, we can use order in which we meet values. If both values have the same order in a function (met at the same -time), we then treat values as *associated*. Otherwise – it depends on who was +time), we then treat values as *associated*. Otherwise, it depends on who was first. Every time we run the top-level compare method, we initialize two identical @@ -623,7 +623,7 @@ to use ``accumulateConstantOffset`` method. So, if we get constant offset for both left and right *GEPs*, then compare it as numbers, and return comparison result. -Otherwise treat it like a regular operation (see previous paragraph). +Otherwise, treat it like a regular operation (see previous paragraph). cmpOperation ------------ @@ -742,7 +742,7 @@ We call ``writeThunkOrAlias(Function *F, Function *G)``. Here we try to replace referenced anywhere, * function should come with external, local or weak linkage. -Otherwise we write thunk: some wrapper that has *G's* interface and calls *F*, +Otherwise, we write thunk: some wrapper that has *G's* interface and calls *F*, so *G* could be replaced with this wrapper. *writeAlias* @@ -772,7 +772,7 @@ As it written in method comments: “Replace G with a simple tail call to bitcast(F). Also replace direct uses of G with bitcast(F). Deletes G.” -In general it does the same as usual when we want to replace callee, except the +In general, it does the same as usual when we want to replace callee, except the first point: 1. We generate tail call wrapper around *F*, but with an interface that allows using diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 49184e3104868..d03f383a92b3b 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -406,6 +406,12 @@ The current vendor extensions supported are: ``XSfvcp`` LLVM implements `version 1.1.0 of the SiFive Vector Coprocessor Interface (VCIX) Software Specification <https://sifive.cdn.prismic.io/sifive/Zn3m1R5LeNNTwnLS_vcix-spec-software-v1p1.pdf>`__ by SiFive. All instructions are prefixed with `sf.vc.` as described in the specification, and the riscv-toolchain-convention document linked above. +``Xsfvfexp16e``, ``Xsfvfbfexp16e``, and ``Xsfvfexp32e`` + LLVM implements `version 0.5 of the Vector Exponential Extension Specification <https://www.sifive.com/document-file/exponential-function-instruction-xsfvfexp32e-xsfvf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. + +``Xsfvfexpa`` and ``Xsfvfexpa64e`` + LLVM implements `version 0.2 of the Vector Exponential Approximation Extension Specification <https://www.sifive.com/document-file/exponential-approximation-instruction-xsfvfexpa-ex>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. + ``XSfvqmaccdod``, ``XSfvqmaccqoq`` LLVM implements `version 1.1.0 of the SiFive Int8 Matrix Multiplication Extensions Specification <https://sifive.cdn.prismic.io/sifive/1a2ad85b-d818-49f7-ba83-f51f1731edbe_int8-matmul-spec.pdf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. 
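Stepping back to the MergeFunctions write-up above: the whole comparison machinery bottoms out in tiny three-way helpers such as cmpNumbers, which composite routines chain together, stopping at the first non-zero result. A minimal sketch of that convention (not the actual FunctionComparator code) looks like this:

```cpp
// Illustration of the -1 / 0 / 1 ordering used throughout the comparison
// routines described in the MergeFunctions document.
#include <cstdint>

static int cmpNumbers(uint64_t L, uint64_t R) {
  if (L < R)
    return -1; // left is "less" than right
  if (L > R)
    return 1; // left is "greater" than right
  return 0;   // equal
}

// Composite comparisons return the first non-zero sub-comparison, which is
// what makes the ordering usable as the "<" of an ordered set such as FnTree.
static int cmpPair(uint64_t LA, uint64_t LB, uint64_t RA, uint64_t RB) {
  if (int Res = cmpNumbers(LA, RA))
    return Res;
  return cmpNumbers(LB, RB);
}
```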
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 85eeabf10244a..749961356d23e 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -173,6 +173,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Allows generating arbitrary width integer types. * - ``SPV_INTEL_bindless_images`` - Adds instructions to convert convert unsigned integer handles to images, samplers and sampled images. + * - ``SPV_INTEL_bfloat16_arithmetic`` + - Allows the use of 16-bit bfloat16 values in arithmetic and relational operators. * - ``SPV_INTEL_bfloat16_conversion`` - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values. * - ``SPV_INTEL_cache_controls`` @@ -187,6 +189,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds decorations that can be applied to global (module scope) variables. * - ``SPV_INTEL_global_variable_fpga_decorations`` - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices. + * - ``SPV_INTEL_kernel_attributes`` + - Adds execution modes that can be applied to entry points to inform scheduling. * - ``SPV_INTEL_media_block_io`` - Adds additional subgroup block read and write functionality that allow applications to flexibly specify the width and height of the block to read from or write to a 2D image. * - ``SPV_INTEL_memory_access_aliasing`` @@ -226,9 +230,9 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e * - ``SPV_INTEL_fp_max_error`` - Adds the ability to specify the maximum error for floating-point operations. * - ``SPV_INTEL_ternary_bitwise_function`` - - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. + - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. * - ``SPV_INTEL_subgroup_matrix_multiply_accumulate`` - - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. + - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. * - ``SPV_INTEL_int4`` - Adds support for 4-bit integer type, and allow this type to be used in cooperative matrices. * - ``SPV_KHR_float_controls2`` diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index bccdb8930561e..82ac9a3a1ef80 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -152,7 +152,7 @@ class APFloatBase { static constexpr unsigned integerPartWidth = APInt::APINT_BITS_PER_WORD; /// A signed type to represent a floating point numbers unbiased exponent. - typedef int32_t ExponentType; + using ExponentType = int32_t; /// \name Floating Point Semantics. /// @{ @@ -938,8 +938,8 @@ LLVM_ABI DoubleAPFloat frexp(const DoubleAPFloat &X, int &Exp, roundingMode); // This is a interface class that is currently forwarding functionalities from // detail::IEEEFloat. 
class APFloat : public APFloatBase { - typedef detail::IEEEFloat IEEEFloat; - typedef detail::DoubleAPFloat DoubleAPFloat; + using IEEEFloat = detail::IEEEFloat; + using DoubleAPFloat = detail::DoubleAPFloat; static_assert(std::is_standard_layout<IEEEFloat>::value); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 9fa98ad4ddde1..fdb3b84b73a1f 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -77,7 +77,7 @@ inline APInt operator-(APInt); /// class [[nodiscard]] APInt { public: - typedef uint64_t WordType; + using WordType = uint64_t; /// Byte size of a word. static constexpr unsigned APINT_WORD_SIZE = sizeof(WordType); @@ -154,6 +154,7 @@ class [[nodiscard]] APInt { /// Once all uses of this constructor are migrated to other constructors, /// consider marking this overload ""= delete" to prevent calls from being /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor. + [[deprecated("Use other constructors of APInt")]] LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]); /// Construct an APInt from a string representation. diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 9e81a4b735e7f..cc3f3a9226395 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -99,7 +99,7 @@ template <typename BitVectorT> class const_set_bits_iterator_impl { }; class BitVector { - typedef uintptr_t BitWord; + using BitWord = uintptr_t; enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT }; @@ -147,8 +147,8 @@ class BitVector { } }; - typedef const_set_bits_iterator_impl<BitVector> const_set_bits_iterator; - typedef const_set_bits_iterator set_iterator; + using const_set_bits_iterator = const_set_bits_iterator_impl<BitVector>; + using set_iterator = const_set_bits_iterator; const_set_bits_iterator set_bits_begin() const { return const_set_bits_iterator(*this); diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h index e9f99bafe9f1e..426a083778d6e 100644 --- a/llvm/include/llvm/ADT/GenericSSAContext.h +++ b/llvm/include/llvm/ADT/GenericSSAContext.h @@ -25,7 +25,7 @@ template <typename, bool> class DominatorTreeBase; template <typename> class SmallVectorImpl; namespace Intrinsic { -typedef unsigned ID; +using ID = unsigned; } // Specializations of this template should provide the types used by the diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index a9841c6651b72..af0e4a36be1b1 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1516,8 +1516,8 @@ template <class Iterator, class RNG> void shuffle(Iterator first, Iterator last, RNG &&g) { // It would be better to use a std::uniform_int_distribution, // but that would be stdlib dependent. - typedef - typename std::iterator_traits<Iterator>::difference_type difference_type; + using difference_type = + typename std::iterator_traits<Iterator>::difference_type; for (auto size = last - first; size > 1; ++first, (void)--size) { difference_type offset = g() % size; // Avoid self-assignment due to incorrect assertions in libstdc++ @@ -2600,16 +2600,6 @@ bool hasNItemsOrLess(ContainerTy &&C, unsigned N) { return hasNItemsOrLess(adl_begin(C), adl_end(C), N); } -/// Returns a raw pointer that represents the same address as the argument. -/// -/// This implementation can be removed once we move to C++20 where it's defined -/// as std::to_address(). 
-/// -/// The std::pointer_traits<>::to_address(p) variations of these overloads has -/// not been implemented. -template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } -template <class T> constexpr T *to_address(T *P) { return P; } - // Detect incomplete types, relying on the fact that their size is unknown. namespace detail { template <typename T> using has_sizeof = decltype(sizeof(T)); diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h index e02694f043fbb..ad94cdede9288 100644 --- a/llvm/include/llvm/ADT/STLForwardCompat.h +++ b/llvm/include/llvm/ADT/STLForwardCompat.h @@ -134,6 +134,16 @@ struct identity // NOLINT(readability-identifier-naming) } }; +/// Returns a raw pointer that represents the same address as the argument. +/// +/// This implementation can be removed once we move to C++20 where it's defined +/// as std::to_address(). +/// +/// The std::pointer_traits<>::to_address(p) variations of these overloads has +/// not been implemented. +template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } +template <class T> constexpr T *to_address(T *P) { return P; } + //===----------------------------------------------------------------------===// // Features from C++23 //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h index 8c8d31bd4f055..53ebec1eb3a54 100644 --- a/llvm/include/llvm/ADT/StringSwitch.h +++ b/llvm/include/llvm/ADT/StringSwitch.h @@ -14,7 +14,6 @@ #define LLVM_ADT_STRINGSWITCH_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> #include <cstring> @@ -42,6 +41,8 @@ namespace llvm { /// .Cases({"violet", "purple"}, Violet) /// .Default(UnknownColor); /// \endcode +/// +/// When multiple matches are found, the value of the first match is returned. template<typename T, typename R = T> class StringSwitch { /// The string we are matching. @@ -64,7 +65,7 @@ class StringSwitch { void operator=(const StringSwitch &) = delete; void operator=(StringSwitch &&) = delete; - // Case-sensitive case matchers + // Case-sensitive case matchers. StringSwitch &Case(StringLiteral S, T Value) { CaseImpl(S, Value); return *this; @@ -214,23 +215,30 @@ class StringSwitch { [[nodiscard]] operator R() { return DefaultUnreachable(); } private: - // Returns true when `Str` matches the `S` argument, and stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument, + // stores the result. bool CaseImpl(StringLiteral S, T &Value) { - if (!Result && Str == S) { - Result = std::move(Value); + if (Result) return true; - } - return false; + + if (Str != S) + return false; + + Result = std::move(Value); + return true; } - // Returns true when `Str` matches the `S` argument (case-insensitive), and - // stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument + // (case-insensitive), stores the result. 
bool CaseLowerImpl(StringLiteral S, T &Value) { - if (!Result && Str.equals_insensitive(S)) { - Result = std::move(Value); + if (Result) return true; - } - return false; + + if (!Str.equals_insensitive(S)) + return false; + + Result = std::move(Value); + return true; } StringSwitch &CasesImpl(std::initializer_list<StringLiteral> Cases, diff --git a/llvm/include/llvm/ADT/ilist.h b/llvm/include/llvm/ADT/ilist.h index aed19ccbff7f2..64392903bec74 100644 --- a/llvm/include/llvm/ADT/ilist.h +++ b/llvm/include/llvm/ADT/ilist.h @@ -108,21 +108,21 @@ template <typename Ty> struct ilist_traits<const Ty> {}; /// list. template <class IntrusiveListT, class TraitsT> class iplist_impl : public TraitsT, IntrusiveListT { - typedef IntrusiveListT base_list_type; + using base_list_type = IntrusiveListT; public: - typedef typename base_list_type::pointer pointer; - typedef typename base_list_type::const_pointer const_pointer; - typedef typename base_list_type::reference reference; - typedef typename base_list_type::const_reference const_reference; - typedef typename base_list_type::value_type value_type; - typedef typename base_list_type::size_type size_type; - typedef typename base_list_type::difference_type difference_type; - typedef typename base_list_type::iterator iterator; - typedef typename base_list_type::const_iterator const_iterator; - typedef typename base_list_type::reverse_iterator reverse_iterator; - typedef - typename base_list_type::const_reverse_iterator const_reverse_iterator; + using pointer = typename base_list_type::pointer; + using const_pointer = typename base_list_type::const_pointer; + using reference = typename base_list_type::reference; + using const_reference = typename base_list_type::const_reference; + using value_type = typename base_list_type::value_type; + using size_type = typename base_list_type::size_type; + using difference_type = typename base_list_type::difference_type; + using iterator = typename base_list_type::iterator; + using const_iterator = typename base_list_type::const_iterator; + using reverse_iterator = typename base_list_type::reverse_iterator; + using const_reverse_iterator = + typename base_list_type::const_reverse_iterator; private: static bool op_less(const_reference L, const_reference R) { return L < R; } diff --git a/llvm/include/llvm/ADT/ilist_node_options.h b/llvm/include/llvm/ADT/ilist_node_options.h index 003d5dabce897..53719b07a3768 100644 --- a/llvm/include/llvm/ADT/ilist_node_options.h +++ b/llvm/include/llvm/ADT/ilist_node_options.h @@ -58,8 +58,8 @@ namespace ilist_detail { template <bool IsExplicit> struct explicitness { static const bool is_explicit = IsExplicit; }; -typedef explicitness<true> is_explicit; -typedef explicitness<false> is_implicit; +using is_explicit = explicitness<true>; +using is_implicit = explicitness<false>; /// Check whether an option is valid. /// @@ -103,12 +103,12 @@ struct is_valid_option<ilist_sentinel_tracking<EnableSentinelTracking>> template <class... Options> struct extract_tag; template <class Tag, class... Options> struct extract_tag<ilist_tag<Tag>, Options...> { - typedef Tag type; + using type = Tag; }; template <class Option1, class... Options> struct extract_tag<Option1, Options...> : extract_tag<Options...> {}; template <> struct extract_tag<> { - typedef void type; + using type = void; }; template <class Tag> struct is_valid_option<ilist_tag<Tag>> : std::true_type {}; @@ -134,11 +134,13 @@ struct is_valid_option<ilist_iterator_bits<IteratorBits>> : std::true_type {}; template <class... 
Options> struct extract_parent; template <class ParentTy, class... Options> struct extract_parent<ilist_parent<ParentTy>, Options...> { - typedef ParentTy type; + using type = ParentTy; }; template <class Option1, class... Options> struct extract_parent<Option1, Options...> : extract_parent<Options...> {}; -template <> struct extract_parent<> { typedef void type; }; +template <> struct extract_parent<> { + using type = void; +}; template <class ParentTy> struct is_valid_option<ilist_parent<ParentTy>> : std::true_type {}; @@ -154,28 +156,27 @@ struct check_options : std::conjunction<is_valid_option<Options>...> {}; template <class T, bool EnableSentinelTracking, bool IsSentinelTrackingExplicit, class TagT, bool HasIteratorBits, class ParentTy> struct node_options { - typedef T value_type; - typedef T *pointer; - typedef T &reference; - typedef const T *const_pointer; - typedef const T &const_reference; + using value_type = T; + using pointer = T *; + using reference = T &; + using const_pointer = const T *; + using const_reference = const T &; static const bool enable_sentinel_tracking = EnableSentinelTracking; static const bool is_sentinel_tracking_explicit = IsSentinelTrackingExplicit; static const bool has_iterator_bits = HasIteratorBits; - typedef TagT tag; - typedef ParentTy parent_ty; - typedef ilist_node_base<enable_sentinel_tracking, parent_ty> node_base_type; - typedef ilist_base<enable_sentinel_tracking, parent_ty> list_base_type; + using tag = TagT; + using parent_ty = ParentTy; + using node_base_type = ilist_node_base<enable_sentinel_tracking, parent_ty>; + using list_base_type = ilist_base<enable_sentinel_tracking, parent_ty>; }; template <class T, class... Options> struct compute_node_options { - typedef node_options<T, extract_sentinel_tracking<Options...>::value, - extract_sentinel_tracking<Options...>::is_explicit, - typename extract_tag<Options...>::type, - extract_iterator_bits<Options...>::value, - typename extract_parent<Options...>::type> - type; + using type = node_options<T, extract_sentinel_tracking<Options...>::value, + extract_sentinel_tracking<Options...>::is_explicit, + typename extract_tag<Options...>::type, + extract_iterator_bits<Options...>::value, + typename extract_parent<Options...>::type>; }; } // end namespace ilist_detail diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 84b4ad7c1d5a9..c85ef3e131068 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -893,7 +893,7 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, /// result of this function is undefined. LLVM_ABI std::optional<int64_t> getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap = DenseMap<Value *, const SCEV *>(), bool Assume = false, bool ShouldCheckWrap = true); diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index af218ba564081..093309cb8bbee 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -1024,6 +1024,16 @@ findValuesAffectedByCondition(Value *Cond, bool IsAssume, LLVM_ABI Value *stripNullTest(Value *V); LLVM_ABI const Value *stripNullTest(const Value *V); +/// Enumerates all possible values of V and inserts them into the set \p +/// Constants. 
If \p AllowUndefOrPoison is false, it fails when V may contain +/// undef/poison elements. Returns true if the result is complete. Otherwise, +/// the result is incomplete (more than MaxCount values). +/// NOTE: The constant values are not distinct. +LLVM_ABI bool +collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison = true); + } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 464f475098ec5..b0c5beae631ce 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -801,6 +801,7 @@ enum AttributeKindCodes { ATTR_KIND_CAPTURES = 102, ATTR_KIND_DEAD_ON_RETURN = 103, ATTR_KIND_SANITIZE_ALLOC_TOKEN = 104, + ATTR_KIND_NO_CREATE_UNDEF_OR_POISON = 105, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h index 69ee4dde1974a..7f5b11223c54d 100644 --- a/llvm/include/llvm/CAS/ActionCache.h +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -75,6 +75,9 @@ class ActionCache { CanBeDistributed); } + /// Validate the ActionCache contents. + virtual Error validate() const = 0; + virtual ~ActionCache() = default; protected: @@ -97,6 +100,9 @@ class ActionCache { /// Create an action cache in memory. std::unique_ptr<ActionCache> createInMemoryActionCache(); +/// Create an action cache on disk. +Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path); + } // end namespace llvm::cas #endif // LLVM_CAS_ACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h new file mode 100644 index 0000000000000..6c165c421b168 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H +#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H + +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ActionCache; +class ObjectStore; + +/// Create on-disk \c ObjectStore and \c ActionCache instances based on +/// \c ondisk::UnifiedOnDiskCache, with built-in hashing. +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +createOnDiskUnifiedCASDatabases(StringRef Path); + +/// Represents the result of validating the contents using +/// \c validateOnDiskUnifiedCASDatabasesIfNeeded. +/// +/// Note: invalid results are handled as an \c Error. +enum class ValidationResult { + /// The data is already valid. + Valid, + /// The data was invalid, but was recovered. + Recovered, + /// Validation was skipped, as it was not needed. + Skipped, +}; + +/// Validate the data in \p Path, if needed to ensure correctness. +/// +/// \param Path directory for the on-disk database. +/// \param CheckHash Whether to validate hashes match the data. +/// \param AllowRecovery Whether to automatically recover from invalid data by +/// marking the files for garbage collection. +/// \param ForceValidation Whether to force validation to occur even if it +/// should not be necessary. 
+/// \param LLVMCasBinaryPath If provided, validation is performed out-of-process +/// using the given \c llvm-cas executable which protects against crashes +/// during validation. Otherwise validation is performed in-process. +/// +/// \returns \c Valid if the data is already valid, \c Recovered if data +/// was invalid but has been cleared, \c Skipped if validation is not needed, +/// or an \c Error if validation cannot be performed or if the data is left +/// in an invalid state because \p AllowRecovery is false. +Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath); + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h index 6db5dd3904095..29950fe9d9029 100644 --- a/llvm/include/llvm/CAS/ObjectStore.h +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the ObjectStore class. +/// +//===----------------------------------------------------------------------===// #ifndef LLVM_CAS_OBJECTSTORE_H #define LLVM_CAS_OBJECTSTORE_H @@ -111,7 +116,10 @@ class ObjectStore { virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; /// Validate the underlying object referred by CASID. - virtual Error validate(const CASID &ID) = 0; + virtual Error validateObject(const CASID &ID) = 0; + + /// Validate the entire ObjectStore. + virtual Error validate(bool CheckHash) const = 0; protected: /// Load the object referenced by \p Ref. @@ -215,9 +223,39 @@ class ObjectStore { return Data.size(); } + /// Set the size for limiting growth of on-disk storage. This has an effect + /// for when the instance is closed. + /// + /// Implementations may leave this unimplemented. + virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) { + return Error::success(); + } + + /// \returns the storage size of the on-disk CAS data. + /// + /// Implementations that don't have an implementation for this should return + /// \p std::nullopt. + virtual Expected<std::optional<uint64_t>> getStorageSize() const { + return std::nullopt; + } + + /// Prune local storage to reduce its size according to the desired size + /// limit. Pruning can happen concurrently with other operations. + /// + /// Implementations may leave this unimplemented. + virtual Error pruneStorageData() { return Error::success(); } + /// Validate the whole node tree. Error validateTree(ObjectRef Ref); + /// Import object from another CAS. This will import the full tree from the + /// other CAS. + Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other); + + /// Print the ObjectStore internals for debugging purpose. + virtual void print(raw_ostream &) const {} + void dump() const; + /// Get CASContext const CASContext &getContext() const { return Context; } @@ -290,8 +328,15 @@ class ObjectProxy { ObjectHandle H; }; +/// Create an in memory CAS. std::unique_ptr<ObjectStore> createInMemoryCAS(); +/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled. +bool isOnDiskCASEnabled(); + +/// Create a persistent on-disk path at \p Path. 
+Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path);
+
 } // namespace cas
 } // namespace llvm
diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h
index 5f0ee0e131c0f..76cc528711b69 100644
--- a/llvm/include/llvm/CAS/OnDiskGraphDB.h
+++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h
@@ -340,13 +340,16 @@ class OnDiskGraphDB {
   /// \param HashByteSize Size for the object digest hash bytes.
   /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
   /// if they don't exist in the primary store. The upstream store is only used
-  /// for reading nodes, new nodes are only written to the primary store.
+  /// for reading nodes, new nodes are only written to the primary store. Users
+  /// need to make sure \p UpstreamDB outlives the current instance of
+  /// OnDiskGraphDB; the common usage is to have a \p UnifiedOnDiskCache manage
+  /// both.
   /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
   /// to primary store. This is recorded at creation time and subsequent opens
   /// need to pass the same policy otherwise the \p open will fail.
   static Expected<std::unique_ptr<OnDiskGraphDB>>
   open(StringRef Path, StringRef HashName, unsigned HashByteSize,
-       std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr,
+       OnDiskGraphDB *UpstreamDB = nullptr,
        FaultInPolicy Policy = FaultInPolicy::FullTree);

   ~OnDiskGraphDB();
@@ -438,8 +441,7 @@ class OnDiskGraphDB {
   // Private constructor.
   OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
-                OnDiskDataAllocator DataPool,
-                std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+                OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB,
                 FaultInPolicy Policy);

   /// Mapping from hash to object reference.
@@ -459,7 +461,7 @@ class OnDiskGraphDB {
   std::string RootPath;

   /// Optional on-disk store to be used for faulting-in nodes.
-  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+  OnDiskGraphDB *UpstreamDB = nullptr;

   /// The policy used to fault in data from upstream.
   FaultInPolicy FIPolicy;
diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
index b762518366c21..17ae52f0307fc 100644
--- a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
+++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
@@ -19,6 +19,8 @@
 namespace llvm::cas::ondisk {

+class UnifiedOnDiskCache;
+
 /// An on-disk key-value data store with the following properties:
 /// * Keys are fixed length binary hashes with expected normal distribution.
 /// * Values are buffers of the same size, specified at creation time.
@@ -59,9 +61,13 @@ class OnDiskKeyValueDB {
   /// \param KeySize Size for the key hash bytes.
   /// \param ValueName Identifier name for the values.
   /// \param ValueSize Size for the value bytes.
+  /// \param UnifiedCache An optional UnifiedOnDiskCache that manages the size
+  /// and lifetime of the CAS instance; it must take ownership of the
+  /// KeyValueDB being initialized here once initialization completes.
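/// For orientation, a minimal usage sketch for the on-disk ObjectStore factory
/// introduced earlier in this patch (createOnDiskCAS, isOnDiskCASEnabled, and
/// ObjectStore::validate); the path and error handling are illustrative only,
/// not part of the patch:
/// \code
///   Error openAndCheck() {
///     if (!cas::isOnDiskCASEnabled())
///       return Error::success();
///     auto CAS = cas::createOnDiskCAS("/path/to/cas");
///     if (!CAS)
///       return CAS.takeError();
///     return (*CAS)->validate(/*CheckHash=*/true);
///   }
/// \endcode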
  static Expected<std::unique_ptr<OnDiskKeyValueDB>>
  open(StringRef Path, StringRef HashName, unsigned KeySize,
-       StringRef ValueName, size_t ValueSize);
+       StringRef ValueName, size_t ValueSize,
+       UnifiedOnDiskCache *UnifiedCache = nullptr);

   using CheckValueT =
       function_ref<Error(FileOffset Offset, ArrayRef<char> Data)>;
@@ -70,11 +76,14 @@ class OnDiskKeyValueDB {
   Error validate(CheckValueT CheckValue) const;

 private:
-  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache)
-      : ValueSize(ValueSize), Cache(std::move(Cache)) {}
+  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache,
+                   UnifiedOnDiskCache *UnifiedCache)
+      : ValueSize(ValueSize), Cache(std::move(Cache)),
+        UnifiedCache(UnifiedCache) {}

   const size_t ValueSize;
   OnDiskTrieRawHashMap Cache;
+  UnifiedOnDiskCache *UnifiedCache = nullptr;
 };

 } // namespace llvm::cas::ondisk
diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
new file mode 100644
index 0000000000000..6e0878a65fe72
--- /dev/null
+++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
@@ -0,0 +1,172 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H
+#define LLVM_CAS_UNIFIEDONDISKCACHE_H
+
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include <atomic>
+
+namespace llvm::cas::ondisk {
+
+class OnDiskKeyValueDB;
+
+/// A unified CAS nodes and key-value database, using on-disk storage for both.
+/// It manages storage growth and provides APIs for garbage collection.
+///
+/// High-level properties:
+/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the
+/// storage size in that directory will keep growing unrestricted. For data to
+/// become eligible for garbage-collection there should be no open instances
+/// of \p UnifiedOnDiskCache for that directory, by any process.
+/// * Garbage-collection needs to be triggered explicitly by the client. It can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers, in the same process or other
+/// processes.
+///
+/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open
+/// for a limited period of time, e.g. for the duration of a build operation.
+/// For long-living processes that need periodic access to a
+/// \p UnifiedOnDiskCache, the client should devise a scheme where access is
+/// performed within some defined period. For example, if a service is designed
+/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it
+/// could keep the instance alive while new requests are coming in but close it
+/// after a time period in which there are no new requests.
+class UnifiedOnDiskCache {
+public:
+  /// The \p OnDiskGraphDB instance for the open directory.
+  OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; }
+
+  /// The \p OnDiskKeyValueDB instance for the open directory.
+  OnDiskKeyValueDB &getKeyValueDB() { return *PrimaryKVDB; }
+
+  /// Open a \p UnifiedOnDiskCache instance for a directory.
+  ///
+  /// \param Path directory for the on-disk database. The directory will be
+  /// created if it doesn't exist.
+  /// \param SizeLimit Optional size for limiting growth. This has an effect for
+  /// when the instance is closed.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param FaultInPolicy Controls how nodes are copied to primary store. This
+  /// is recorded at creation time and subsequent opens need to pass the same
+  /// policy otherwise the \p open will fail.
+  static Expected<std::unique_ptr<UnifiedOnDiskCache>>
+  open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName,
+       unsigned HashByteSize,
+       OnDiskGraphDB::FaultInPolicy FaultInPolicy =
+           OnDiskGraphDB::FaultInPolicy::FullTree);
+
+  /// Validate the data in \p Path, if needed to ensure correctness.
+  ///
+  /// Note: if invalid data is detected and \p AllowRecovery is true, then
+  /// recovery requires exclusive access to the CAS and it is an error to
+  /// attempt recovery if there is concurrent use of the CAS.
+  ///
+  /// \param Path directory for the on-disk database.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param CheckHash Whether to validate hashes match the data.
+  /// \param AllowRecovery Whether to automatically recover from invalid data by
+  /// marking the files for garbage collection.
+  /// \param ForceValidation Whether to force validation to occur even if it
+  /// should not be necessary.
+  /// \param LLVMCasBinary If provided, validation is performed out-of-process
+  /// using the given \c llvm-cas executable which protects against crashes
+  /// during validation. Otherwise validation is performed in-process.
+  ///
+  /// \returns \c Valid if the data is already valid, \c Recovered if data
+  /// was invalid but has been cleared, \c Skipped if validation is not needed,
+  /// or an \c Error if validation cannot be performed or if the data is left
+  /// in an invalid state because \p AllowRecovery is false.
+  static Expected<ValidationResult>
+  validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize,
+                   bool CheckHash, bool AllowRecovery, bool ForceValidation,
+                   std::optional<StringRef> LLVMCasBinary);
+
+  /// This is called implicitly at destruction time, so it is not required for a
+  /// client to call this. After calling \p close the only method that is valid
+  /// to call is \p needsGarbageCollection.
+  ///
+  /// \param CheckSizeLimit if true it will check whether the primary store has
+  /// exceeded its intended size limit. If false the check is skipped even if a
+  /// \p SizeLimit was passed to the \p open call.
+  Error close(bool CheckSizeLimit = true);
+
+  /// Set the size for limiting growth. This has an effect for when the instance
+  /// is closed.
+  void setSizeLimit(std::optional<uint64_t> SizeLimit);
+
+  /// \returns the storage size of the cache data.
+  uint64_t getStorageSize() const;
+
+  /// \returns whether the primary store has exceeded the intended size limit.
+  /// This can return false even if the overall size of the opened directory is
+  /// over the \p SizeLimit passed to \p open. To know whether garbage
+  /// collection needs to be triggered or not, call \p needsGarbageCollection.
+  bool hasExceededSizeLimit() const;
+
+  /// \returns whether there are unused data that can be deleted using a
+  /// \p collectGarbage call.
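/// A sketch of the open/close lifecycle described above, assuming the
/// declarations in this header; the path, hash name, size limit, and function
/// name are illustrative only:
/// \code
///   Error useCAS() {
///     auto UniDB = UnifiedOnDiskCache::open(
///         "/path/to/cas", /*SizeLimit=*/std::nullopt, "BLAKE3",
///         /*HashByteSize=*/32);
///     if (!UniDB)
///       return UniDB.takeError();
///     OnDiskGraphDB &Nodes = (*UniDB)->getGraphDB();       // CAS node storage
///     OnDiskKeyValueDB &Cache = (*UniDB)->getKeyValueDB(); // key-value storage
///     // ... perform reads/writes for the duration of the build ...
///     if (Error E = (*UniDB)->close())
///       return E;
///     if ((*UniDB)->needsGarbageCollection())
///       return UnifiedOnDiskCache::collectGarbage("/path/to/cas");
///     return Error::success();
///   }
/// \endcode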
+ bool needsGarbageCollection() const { return NeedsGarbageCollection; } + + /// Remove any unused data from the directory at \p Path. If there are no such + /// data the operation is a no-op. + /// + /// This can be called concurrently, regardless of whether there is an open + /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers + /// in the same process or other processes. + /// + /// It is recommended that garbage-collection is triggered concurrently in the + /// background, so that it has minimal effect on the workload of the process. + static Error collectGarbage(StringRef Path); + + /// Remove unused data from the current UnifiedOnDiskCache. + Error collectGarbage(); + + /// Helper function to convert the value stored in KeyValueDB and ObjectID. + static ObjectID getObjectIDFromValue(ArrayRef<char> Value); + + using ValueBytes = std::array<char, sizeof(uint64_t)>; + static ValueBytes getValueFromObjectID(ObjectID ID); + + ~UnifiedOnDiskCache(); + +private: + friend class OnDiskGraphDB; + friend class OnDiskKeyValueDB; + + UnifiedOnDiskCache(); + + Expected<std::optional<ArrayRef<char>>> + faultInFromUpstreamKV(ArrayRef<uint8_t> Key); + + /// \returns the storage size of the primary directory. + uint64_t getPrimaryStorageSize() const; + + std::string RootPath; + std::atomic<uint64_t> SizeLimit; + + int LockFD = -1; + + std::atomic<bool> NeedsGarbageCollection; + std::string PrimaryDBDir; + + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; +}; + +} // namespace llvm::cas::ondisk + +#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e8dbc964a943e..221d8f1e2f673 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,7 +302,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { /// (e.g. scalarization). std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost( const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind, - RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) const { Type *RetTy = ICA.getReturnType(); // Vector variants of the intrinsic can be mapped to a vector library call. @@ -311,12 +310,38 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { !isVectorizedStructTy(cast<StructType>(RetTy))) return std::nullopt; + Type *Ty = getContainedTypes(RetTy).front(); + EVT VT = getTLI()->getValueType(DL, Ty); + + EVT ScalarVT = VT.getScalarType(); + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + + switch (ICA.getID()) { + case Intrinsic::modf: + LC = RTLIB::getMODF(ScalarVT); + break; + case Intrinsic::sincospi: + LC = RTLIB::getSINCOSPI(ScalarVT); + break; + case Intrinsic::sincos: + LC = RTLIB::getSINCOS(ScalarVT); + break; + default: + return std::nullopt; + } + // Find associated libcall. - const char *LCName = getTLI()->getLibcallName(LC); - if (!LCName) + RTLIB::LibcallImpl LibcallImpl = getTLI()->getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) return std::nullopt; + StringRef LCName = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpl); + // Search for a corresponding vector variant. + // + // FIXME: Should use RuntimeLibcallsInfo, not TargetLibraryInfo to get the + // vector mapping. 
LLVMContext &Ctx = RetTy->getContext(); ElementCount VF = getVectorizedTypeVF(RetTy); VecDesc const *VD = nullptr; @@ -2137,22 +2162,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { case Intrinsic::modf: case Intrinsic::sincos: case Intrinsic::sincospi: { - Type *Ty = getContainedTypes(RetTy).front(); - EVT VT = getTLI()->getValueType(DL, Ty); - - RTLIB::Libcall LC = [&] { - switch (ICA.getID()) { - case Intrinsic::modf: - return RTLIB::getMODF; - case Intrinsic::sincos: - return RTLIB::getSINCOS; - case Intrinsic::sincospi: - return RTLIB::getSINCOSPI; - default: - llvm_unreachable("unexpected intrinsic"); - } - }()(VT.getScalarType()); - std::optional<unsigned> CallRetElementIndex; // The first element of the modf result is returned by value in the // libcall. @@ -2160,7 +2169,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { CallRetElementIndex = 0; if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost( - ICA, CostKind, LC, CallRetElementIndex)) + ICA, CostKind, CallRetElementIndex)) return *Cost; // Otherwise, fallback to default scalarization cost. break; diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 6c5c27c9662e4..7b965d400ed08 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -1038,7 +1038,7 @@ class SchedBoundary { getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, unsigned ReleaseAtCycle, unsigned AcquireAtCycle); - bool isUnbufferedGroup(unsigned PIdx) const { + bool isReservedGroup(unsigned PIdx) const { return SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin && !SchedModel->getProcResource(PIdx)->BufferSize; } diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 1a5ffb38f2568..0dd4f23c6d85f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1260,9 +1260,15 @@ class SelectionDAG { /// stack arguments from being clobbered. LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain); - std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl, - SDValue Dst, SDValue Src, SDValue Size, - const CallInst *CI); + /// Lower a memcmp operation into a target library call and return the + /// resulting chain and call result as SelectionDAG SDValues. + LLVM_ABI std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl, + SDValue Dst, SDValue Src, + SDValue Size, + const CallInst *CI); + + /// Lower a strlen operation into a target library call and return the + /// resulting chain and call result as SelectionDAG SDValues. LLVM_ABI std::pair<SDValue, SDValue> getStrlen(SDValue Chain, const SDLoc &dl, SDValue Src, const CallInst *CI); diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index e7e87bbfebf38..b404c92e71836 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -211,6 +211,8 @@ struct DIDumpOptions { bool ShowAggregateErrors = false; bool PrintRegisterOnly = false; std::string JsonErrSummaryFile; + /// List of DWARF tags to filter children by. 
+ llvm::SmallVector<unsigned, 0> FilterChildTag; std::function<llvm::StringRef(uint64_t DwarfRegNum, bool IsEH)> GetNameForDWARFReg; diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 8ce2b1bea8fac..c086a39616249 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,11 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Specify how the pointer may be captured. def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; +/// Result will not be undef or poison if all arguments are not undef and not +/// poison. +def NoCreateUndefOrPoison + : EnumAttr<"nocreateundeforpoison", IntersectAnd, [FnAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 27930bbc651bd..8bd060ae8f485 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -3556,6 +3556,11 @@ class SwitchInstProfUpdateWrapper { /// correspondent branch weight. LLVM_ABI SwitchInst::CaseIt removeCase(SwitchInst::CaseIt I); + /// Replace the default destination by given case. Delegate the call to + /// the underlying SwitchInst::setDefaultDest and remove correspondent branch + /// weight. + LLVM_ABI void replaceDefaultDest(SwitchInst::CaseIt I); + /// Delegate the call to the underlying SwitchInst::addCase() and set the /// specified branch weight for the added case. LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest, CaseWeightOpt W); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4d59ee8676b9e..6a079f62dd9cf 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -186,6 +186,10 @@ def IntrSpeculatable : IntrinsicProperty; // defined by the hasSideEffects property of the TableGen Instruction class. def IntrHasSideEffects : IntrinsicProperty; +// Result will not be undef or poison if all arguments are not undef and not +// poison. +def IntrNoCreateUndefOrPoison : IntrinsicProperty; + //===----------------------------------------------------------------------===// // IIT constants and utils //===----------------------------------------------------------------------===// @@ -1039,7 +1043,7 @@ def int_experimental_memset_pattern // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fma : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1052,16 +1056,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // environment so they can be treated as readnone. 
def int_sqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_powi : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty]>; - def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_sin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_cos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_pow : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_log : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; @@ -1080,12 +1076,6 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_nearbyint : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_round : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_roundeven : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; // Truncate a floating point number with a specific rounding mode def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], @@ -1097,6 +1087,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_arithmetic_fence : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + // If the value doesn't fit an unspecified value is returned, but this + // is not poison so we can still mark these as IntrNoCreateUndefOrPoison. def int_lround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_llround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_lrint : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; @@ -1110,29 +1102,50 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_frexp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty, llvm_anyint_ty], [LLVMMatchType<0>]>; } +// TODO: Move all of these into the IntrNoCreateUndefOrPoison case above. +let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + // These functions do not read memory, but are sensitive to the + // rounding mode. LLVM purposely does not model changes to the FP + // environment so they can be treated as readnone. 
+ def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; +} + def int_minnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maxnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; // Internal interface for object size checking @@ -1164,7 +1177,7 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { def int_is_fpclass : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, ImmArg<ArgIndex<1>>]>; //===--------------- Constrained Floating Point Intrinsics ----------------===// // @@ -1406,7 +1419,7 @@ def int_expect_with_probability : DefaultAttrsIntrinsic<[llvm_anyint_ty], // // None of these intrinsics accesses memory at all. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_bswap: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_ctpop: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_bitreverse : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; @@ -1521,7 +1534,7 @@ def int_adjust_trampoline : DefaultAttrsIntrinsic< // // Expose the carry flag from add operations on two integrals. 
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_sadd_with_overflow : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1547,16 +1560,16 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // def int_sadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_uadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_ssub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_usub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_sshl_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; @@ -1611,22 +1624,22 @@ def int_abs : DefaultAttrsIntrinsic< def int_smax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_smin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_scmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; def int_ucmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; //===------------------------- Memory Use Markers -------------------------===// // @@ -1868,7 +1881,7 @@ def int_convert_from_fp16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i16_ } // Saturating floating point to integer intrinsics -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fptoui_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_fptosi_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; } @@ -1890,7 +1903,7 @@ def int_fake_use : DefaultAttrsIntrinsic<[], [llvm_vararg_ty], // First argument must be pointer or vector of pointer. This is checked by the // verifier. 
def int_ptrmask: DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_anyint_ty], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; // Intrinsic to wrap a thread local variable. def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d6b85630eb979..9924b905aee63 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -140,6 +140,9 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1 def int_dx_isnan : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; +def int_dx_legacyf16tof32 : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_float_ty>], + [llvm_anyint_ty], [IntrNoMem]>; + def int_dx_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td 
b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bc51fb639fd75..f39c6cda2c579 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -200,4 +200,7 @@ def int_spv_resource_nonuniformindex def int_spv_generic_cast_to_ptr_explicit : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [generic_ptr_ty], [IntrNoMem, NoUndef<RetIndex>]>; + + def int_spv_unpackhalf2x16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i32_ty], [IntrNoMem]>; + } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 01359894b0421..ab14ed44fed52 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -186,6 +186,13 @@ struct RuntimeLibcallsInfo { return RTLIB::Unsupported; } + /// \returns the function type and attributes for the \p LibcallImpl, + /// depending on the target \p TT. If the function has incomplete type + /// information, return nullptr for the function type. + std::pair<FunctionType *, AttributeList> + getFunctionTy(LLVMContext &Ctx, const Triple &TT, const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const; + private: LLVM_ABI static iota_range<RTLIB::LibcallImpl> lookupLibcallImplNameImpl(StringRef Name); diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 01e7c6b07dd36..f4c1e30b097ee 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -447,7 +447,7 @@ class LLVM_ABI MachOObjectFile : public ObjectFile { uint64_t getSectionAddress(DataRefImpl Sec) const override; uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; - ArrayRef<uint8_t> getSectionContents(uint32_t Offset, uint64_t Size) const; + ArrayRef<uint8_t> getSectionContents(uint64_t Offset, uint64_t Size) const; Expected<ArrayRef<uint8_t>> getSectionContents(DataRefImpl Sec) const override; uint64_t getSectionAlignment(DataRefImpl Sec) const override; diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h index bc0265904ef65..fffcbd9f3c1d8 100644 --- a/llvm/include/llvm/Support/Allocator.h +++ b/llvm/include/llvm/Support/Allocator.h @@ -380,7 +380,7 @@ class BumpPtrAllocatorImpl /// The standard BumpPtrAllocator which just uses the default template /// parameters. -typedef BumpPtrAllocatorImpl<> BumpPtrAllocator; +using BumpPtrAllocator = BumpPtrAllocatorImpl<>; /// A BumpPtrAllocator that allows only elements of a specific type to be /// allocated. 
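// Most of the llvm/Support header changes in this patch are mechanical
// typedef-to-using conversions. The two spellings are equivalent; a generic
// sketch with an illustrative type name:
//
//   typedef std::map<std::string, int> LegacyMap;   // old spelling
//   using LegacyMap = std::map<std::string, int>;   // equivalent alias
//
// The alias form additionally supports template parameters (alias templates),
// which the typedef spelling cannot express.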
diff --git a/llvm/include/llvm/Support/Atomic.h b/llvm/include/llvm/Support/Atomic.h index c2d9ae2da231c..3c62672a077f1 100644 --- a/llvm/include/llvm/Support/Atomic.h +++ b/llvm/include/llvm/Support/Atomic.h @@ -30,9 +30,9 @@ namespace llvm { LLVM_ABI void MemoryFence(); #ifdef _MSC_VER - typedef long cas_flag; + using cas_flag = long; #else - typedef uint32_t cas_flag; + using cas_flag = uint32_t; #endif LLVM_ABI cas_flag CompareAndSwap(volatile cas_flag *ptr, cas_flag new_value, cas_flag old_value); diff --git a/llvm/include/llvm/Support/BinaryStreamArray.h b/llvm/include/llvm/Support/BinaryStreamArray.h index ef2233c53ec2c..a7d03f6511f12 100644 --- a/llvm/include/llvm/Support/BinaryStreamArray.h +++ b/llvm/include/llvm/Support/BinaryStreamArray.h @@ -93,7 +93,7 @@ class VarStreamArray { friend class VarStreamArrayIterator<ValueType, Extractor>; public: - typedef VarStreamArrayIterator<ValueType, Extractor> Iterator; + using Iterator = VarStreamArrayIterator<ValueType, Extractor>; VarStreamArray() = default; @@ -156,8 +156,8 @@ template <typename ValueType, typename Extractor> class VarStreamArrayIterator : public iterator_facade_base<VarStreamArrayIterator<ValueType, Extractor>, std::forward_iterator_tag, const ValueType> { - typedef VarStreamArrayIterator<ValueType, Extractor> IterType; - typedef VarStreamArray<ValueType, Extractor> ArrayType; + using IterType = VarStreamArrayIterator<ValueType, Extractor>; + using ArrayType = VarStreamArray<ValueType, Extractor>; public: VarStreamArrayIterator(const ArrayType &Array, const Extractor &E, @@ -260,7 +260,7 @@ template <typename T> class FixedStreamArray { friend class FixedStreamArrayIterator<T>; public: - typedef FixedStreamArrayIterator<T> Iterator; + using Iterator = FixedStreamArrayIterator<T>; FixedStreamArray() = default; explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) { diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 6f6df2e9703ea..a6435a2562a2b 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -816,6 +816,42 @@ template <typename... Types> struct IsaAndPresentCheckPredicate { return isa_and_present<Types...>(Val); } }; + +//===----------------------------------------------------------------------===// +// Casting Function Objects +//===----------------------------------------------------------------------===// + +/// Usable in generic algorithms like map_range +template <typename U> struct StaticCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return static_cast<U>(Val); + } +}; + +template <typename U> struct DynCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast<U>(Val); + } +}; + +template <typename U> struct CastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast<U>(Val); + } +}; + +template <typename U> struct CastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast_if_present<U>(Val); + } +}; + +template <typename U> struct DynCastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast_if_present<U>(Val); + } +}; + } // namespace detail /// Function object wrapper for the `llvm::isa` type check. The function call @@ -841,6 +877,20 @@ template <typename... Types> inline constexpr detail::IsaAndPresentCheckPredicate<Types...> IsaAndPresentPred{}; +/// Function objects corresponding to the Cast types defined above. 
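/// A sketch of how the function objects declared just below compose with the
/// STLExtras range helpers; `Vals` (a range of `Value *`) and the lambda are
/// illustrative only:
/// \code
///   // Map every value to an Instruction *, or nullptr if it is not one.
///   auto MaybeInsts = map_range(Vals, DynCastTo<Instruction>);
///   // Drop the nulls.
///   auto Insts = make_filter_range(
///       MaybeInsts, [](Instruction *I) { return I != nullptr; });
/// \endcode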
+template <typename From> +inline constexpr detail::StaticCastFunc<From> StaticCastTo{}; + +template <typename From> inline constexpr detail::CastFunc<From> CastTo{}; + +template <typename From> +inline constexpr detail::CastIfPresentFunc<From> CastIfPresentTo{}; + +template <typename From> +inline constexpr detail::DynCastIfPresentFunc<From> DynCastIfPresentTo{}; + +template <typename From> inline constexpr detail::DynCastFunc<From> DynCastTo{}; + } // end namespace llvm #endif // LLVM_SUPPORT_CASTING_H diff --git a/llvm/include/llvm/Support/Chrono.h b/llvm/include/llvm/Support/Chrono.h index 5b8102d8e11cf..e5f98249cc074 100644 --- a/llvm/include/llvm/Support/Chrono.h +++ b/llvm/include/llvm/Support/Chrono.h @@ -150,10 +150,10 @@ template <> struct unit<std::nano> { template <typename Rep, typename Period> struct format_provider<std::chrono::duration<Rep, Period>> { private: - typedef std::chrono::duration<Rep, Period> Dur; - typedef std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, - double, intmax_t> - InternalRep; + using Dur = std::chrono::duration<Rep, Period>; + using InternalRep = + std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, + double, intmax_t>; template <typename AsPeriod> static InternalRep getAs(const Dur &D) { using namespace std::chrono; diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index bb1723518a490..ddf7057bff59d 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -126,10 +126,10 @@ namespace llvm { bit mask & shift operations. ------------------------------------------------------------------------ */ -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ +using UTF32 = unsigned int; /* at least 32 bits */ +using UTF16 = unsigned short; /* at least 16 bits */ +using UTF8 = unsigned char; /* typically 8 bits */ +using Boolean = unsigned char; /* 0 or 1 */ /* Some fundamental constants */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD @@ -146,17 +146,14 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF #define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000 -typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; +enum ConversionResult { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. 
room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +}; + +enum ConversionFlags { strictConversion = 0, lenientConversion }; LLVM_ABI ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 39a08d499b67e..9904a0dd86559 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -140,7 +140,7 @@ class DebugCounter { } // Iterate through the registered counters - typedef UniqueVector<std::string> CounterVector; + using CounterVector = UniqueVector<std::string>; CounterVector::const_iterator begin() const { return RegisteredCounters.begin(); } diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h index 4c17b6e83acd2..a4fd008a9ff3f 100644 --- a/llvm/include/llvm/Support/ErrorHandling.h +++ b/llvm/include/llvm/Support/ErrorHandling.h @@ -21,8 +21,8 @@ class StringRef; class Twine; /// An error handler callback. -typedef void (*fatal_error_handler_t)(void *user_data, const char *reason, - bool gen_crash_diag); +using fatal_error_handler_t = void (*)(void *user_data, const char *reason, + bool gen_crash_diag); /// install_fatal_error_handler - Installs a new error handler to be used /// whenever a serious (non-recoverable) error is encountered by LLVM. diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h index 0fdc7b6f94da7..c0b245e297a58 100644 --- a/llvm/include/llvm/Support/FormatVariadicDetails.h +++ b/llvm/include/llvm/Support/FormatVariadicDetails.h @@ -63,8 +63,8 @@ template <typename T> class missing_format_adapter; template <class T> class has_FormatProvider { public: using Decayed = std::decay_t<T>; - typedef void (*Signature_format)(const Decayed &, llvm::raw_ostream &, - StringRef); + using Signature_format = void (*)(const Decayed &, llvm::raw_ostream &, + StringRef); template <typename U> using check = SameType<Signature_format, &U::format>; diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index b6bb360d9868f..9e2f61fd03e78 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -150,9 +150,9 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return SubLoops; } - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return getSubLoops().begin(); } iterator end() const { return getSubLoops().end(); } reverse_iterator rbegin() const { return getSubLoops().rbegin(); } @@ -174,7 +174,7 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return Blocks; } - typedef typename ArrayRef<BlockT *>::const_iterator block_iterator; + using block_iterator = typename ArrayRef<BlockT *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -302,7 +302,7 @@ template <class BlockT, class LoopT> class LoopBase { bool hasNoExitBlocks() const; /// Edge 
type. - typedef std::pair<BlockT *, BlockT *> Edge; + using Edge = std::pair<BlockT *, BlockT *>; /// Return all pairs of (_inside_block_,_outside_block_). void getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const; @@ -575,9 +575,9 @@ template <class BlockT, class LoopT> class LoopInfoBase { /// iterator/begin/end - The interface to the top-level loops in the current /// function. /// - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return TopLevelLoops.begin(); } iterator end() const { return TopLevelLoops.end(); } reverse_iterator rbegin() const { return TopLevelLoops.rbegin(); } diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index 541678001a8ff..c830f0a67a448 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -459,7 +459,7 @@ template <class BlockT, class LoopT> static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, LoopInfoBase<BlockT, LoopT> *LI, const DomTreeBase<BlockT> &DomTree) { - typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits; + using InvBlockTraits = GraphTraits<Inverse<BlockT *>>; unsigned NumBlocks = 0; unsigned NumSubloops = 0; @@ -513,8 +513,8 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, /// Populate all loop data in a stable order during a single forward DFS. template <class BlockT, class LoopT> class PopulateLoopsDFS { - typedef GraphTraits<BlockT *> BlockTraits; - typedef typename BlockTraits::ChildIteratorType SuccIterTy; + using BlockTraits = GraphTraits<BlockT *>; + using SuccIterTy = typename BlockTraits::ChildIteratorType; LoopInfoBase<BlockT, LoopT> *LI; diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index 4ba386753f397..dbcb66d7680e4 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -90,7 +90,7 @@ class MD5 { private: // Any 32-bit or wider unsigned integer data type will do. - typedef uint32_t MD5_u32plus; + using MD5_u32plus = uint32_t; // Internal State struct { diff --git a/llvm/include/llvm/Support/Mutex.h b/llvm/include/llvm/Support/Mutex.h index d61e3fd96efbe..3ca5c9a2f6be8 100644 --- a/llvm/include/llvm/Support/Mutex.h +++ b/llvm/include/llvm/Support/Mutex.h @@ -63,12 +63,12 @@ namespace llvm }; /// Mutex - A standard, always enforced mutex. 
- typedef SmartMutex<false> Mutex; + using Mutex = SmartMutex<false>; template <bool mt_only> using SmartScopedLock = std::lock_guard<SmartMutex<mt_only>>; - typedef SmartScopedLock<false> ScopedLock; + using ScopedLock = SmartScopedLock<false>; } } diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index d7d72cfbbc649..54c6b713478b9 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -69,7 +69,7 @@ template <typename Info> class OnDiskChainedHashTableGenerator { : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} }; - typedef typename Info::offset_type offset_type; + using offset_type = typename Info::offset_type; offset_type NumBuckets; offset_type NumEntries; llvm::SpecificBumpPtrAllocator<Item> BA; @@ -278,12 +278,12 @@ template <typename Info> class OnDiskChainedHashTable { Info InfoObj; public: - typedef Info InfoType; - typedef typename Info::internal_key_type internal_key_type; - typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - typedef typename Info::hash_value_type hash_value_type; - typedef typename Info::offset_type offset_type; + using InfoType = Info; + using internal_key_type = typename Info::internal_key_type; + using external_key_type = typename Info::external_key_type; + using data_type = typename Info::data_type; + using hash_value_type = typename Info::hash_value_type; + using offset_type = typename Info::offset_type; OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, const unsigned char *Buckets, @@ -435,12 +435,12 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { const unsigned char *Payload; public: - typedef OnDiskChainedHashTable<Info> base_type; - typedef typename base_type::internal_key_type internal_key_type; - typedef typename base_type::external_key_type external_key_type; - typedef typename base_type::data_type data_type; - typedef typename base_type::hash_value_type hash_value_type; - typedef typename base_type::offset_type offset_type; + using base_type = OnDiskChainedHashTable<Info>; + using internal_key_type = typename base_type::internal_key_type; + using external_key_type = typename base_type::external_key_type; + using data_type = typename base_type::data_type; + using hash_value_type = typename base_type::hash_value_type; + using offset_type = typename base_type::offset_type; private: /// Iterates over all of the keys in the table. 
@@ -450,7 +450,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { offset_type NumEntriesLeft; public: - typedef external_key_type value_type; + using value_type = external_key_type; iterator_base(const unsigned char *const Ptr, offset_type NumEntries) : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {} @@ -505,7 +505,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef external_key_type value_type; + using value_type = external_key_type; key_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) @@ -551,7 +551,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef data_type value_type; + using value_type = data_type; data_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) diff --git a/llvm/include/llvm/Support/PointerLikeTypeTraits.h b/llvm/include/llvm/Support/PointerLikeTypeTraits.h index 320f6b63b447e..a47d68406acf3 100644 --- a/llvm/include/llvm/Support/PointerLikeTypeTraits.h +++ b/llvm/include/llvm/Support/PointerLikeTypeTraits.h @@ -70,7 +70,7 @@ template <> struct PointerLikeTypeTraits<void *> { // Provide PointerLikeTypeTraits for const things. template <typename T> struct PointerLikeTypeTraits<const T> { - typedef PointerLikeTypeTraits<T> NonConst; + using NonConst = PointerLikeTypeTraits<T>; static inline const void *getAsVoidPointer(const T P) { return NonConst::getAsVoidPointer(P); @@ -83,7 +83,7 @@ template <typename T> struct PointerLikeTypeTraits<const T> { // Provide PointerLikeTypeTraits for const pointers. template <typename T> struct PointerLikeTypeTraits<const T *> { - typedef PointerLikeTypeTraits<T *> NonConst; + using NonConst = PointerLikeTypeTraits<T *>; static inline const void *getAsVoidPointer(const T *P) { return NonConst::getAsVoidPointer(const_cast<T *>(P)); diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index 53c2e7597b2b4..575e416587ea8 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -39,8 +39,8 @@ const char EnvPathSeparator = ';'; typedef unsigned long procid_t; // Must match the type of DWORD on Windows. typedef void *process_t; // Must match the type of HANDLE on Windows. #else -typedef ::pid_t procid_t; -typedef procid_t process_t; +using procid_t = ::pid_t; +using process_t = procid_t; #endif /// This struct encapsulates information about a process. diff --git a/llvm/include/llvm/Support/RISCVISAUtils.h b/llvm/include/llvm/Support/RISCVISAUtils.h index 165bb08d66431..05fd32e0e7cfe 100644 --- a/llvm/include/llvm/Support/RISCVISAUtils.h +++ b/llvm/include/llvm/Support/RISCVISAUtils.h @@ -40,8 +40,8 @@ struct ExtensionComparator { /// OrderedExtensionMap is std::map, it's specialized to keep entries /// in canonical order of extension. 
-typedef std::map<std::string, ExtensionVersion, ExtensionComparator> - OrderedExtensionMap; +using OrderedExtensionMap = + std::map<std::string, ExtensionVersion, ExtensionComparator>; } // namespace RISCVISAUtils diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h index 8d221aaab9ab9..efc1ca19a1208 100644 --- a/llvm/include/llvm/Support/RWMutex.h +++ b/llvm/include/llvm/Support/RWMutex.h @@ -162,7 +162,7 @@ template <bool mt_only> class SmartRWMutex { bool try_lock() { return impl.try_lock(); } }; -typedef SmartRWMutex<false> RWMutex; +using RWMutex = SmartRWMutex<false>; /// ScopedReader - RAII acquisition of a reader lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -179,7 +179,7 @@ template <bool mt_only> struct SmartScopedReader { ~SmartScopedReader() { mutex.unlock_shared(); } }; #endif -typedef SmartScopedReader<false> ScopedReader; +using ScopedReader = SmartScopedReader<false>; /// ScopedWriter - RAII acquisition of a writer lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -196,7 +196,7 @@ template <bool mt_only> struct SmartScopedWriter { ~SmartScopedWriter() { mutex.unlock(); } }; #endif -typedef SmartScopedWriter<false> ScopedWriter; +using ScopedWriter = SmartScopedWriter<false>; } // end namespace sys } // end namespace llvm diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index c02f15e5e32b8..acd3b06fde6e7 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -43,8 +43,8 @@ namespace llvm { template <typename T> class Registry { public: - typedef T type; - typedef SimpleRegistryEntry<T> entry; + using type = T; + using entry = SimpleRegistryEntry<T>; class node; class iterator; diff --git a/llvm/include/llvm/Support/ScaledNumber.h b/llvm/include/llvm/Support/ScaledNumber.h index 07baf153e10c6..8ca8d457e339e 100644 --- a/llvm/include/llvm/Support/ScaledNumber.h +++ b/llvm/include/llvm/Support/ScaledNumber.h @@ -498,10 +498,10 @@ template <class DigitsT> class ScaledNumber : ScaledNumberBase { static_assert(!std::numeric_limits<DigitsT>::is_signed, "only unsigned floats supported"); - typedef DigitsT DigitsType; + using DigitsType = DigitsT; private: - typedef std::numeric_limits<DigitsType> DigitsLimits; + using DigitsLimits = std::numeric_limits<DigitsType>; static constexpr int Width = sizeof(DigitsType) * 8; static_assert(Width <= 64, "invalid integer width for digits"); @@ -782,7 +782,7 @@ uint64_t ScaledNumber<DigitsT>::scale(uint64_t N) const { template <class DigitsT> template <class IntT> IntT ScaledNumber<DigitsT>::toInt() const { - typedef std::numeric_limits<IntT> Limits; + using Limits = std::numeric_limits<IntT>; if (*this < 1) return 0; if (*this >= Limits::max()) diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h index 4c78235abf508..eac66d84d6f63 100644 --- a/llvm/include/llvm/Support/SuffixTree.h +++ b/llvm/include/llvm/Support/SuffixTree.h @@ -219,7 +219,7 @@ class SuffixTree { } }; - typedef RepeatedSubstringIterator iterator; + using iterator = RepeatedSubstringIterator; iterator begin() { return iterator(Root, LeafNodes); } iterator end() { return iterator(nullptr); } }; diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 88846807f111a..89d90b3438e92 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -53,7 +53,7 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; } #if 
LLVM_THREADING_USE_STD_CALL_ONCE - typedef std::once_flag once_flag; +using once_flag = std::once_flag; #else diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index c47976524dcd9..218c2e336d77b 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -76,7 +76,7 @@ class TrailingObjectsBase { // number of a different type. e.g.: // ExtractSecondType<Foo..., int>::type template <typename Ty1, typename Ty2> struct ExtractSecondType { - typedef Ty2 type; + using type = Ty2; }; // TrailingObjectsImpl is somewhat complicated, because it is a @@ -101,8 +101,8 @@ class TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, PrevTy, NextTy, : public TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> { - typedef TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> - ParentType; + using ParentType = + TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...>; struct RequiresRealignment { static const bool value = alignof(PrevTy) < alignof(NextTy); diff --git a/llvm/include/llvm/Support/UnicodeCharRanges.h b/llvm/include/llvm/Support/UnicodeCharRanges.h index 7f1a9b3ff0c3b..2b5fc83d34690 100644 --- a/llvm/include/llvm/Support/UnicodeCharRanges.h +++ b/llvm/include/llvm/Support/UnicodeCharRanges.h @@ -37,7 +37,7 @@ inline bool operator<(UnicodeCharRange Range, uint32_t Value) { /// array. class UnicodeCharSet { public: - typedef ArrayRef<UnicodeCharRange> CharRanges; + using CharRanges = ArrayRef<UnicodeCharRange>; /// Constructs a UnicodeCharSet instance from an array of /// UnicodeCharRanges. diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h index e15a98dc5a677..ffad1241c3e3d 100644 --- a/llvm/include/llvm/Support/float128.h +++ b/llvm/include/llvm/Support/float128.h @@ -14,7 +14,7 @@ namespace llvm { #if defined(__clang__) && defined(__FLOAT128__) && \ defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) #define HAS_IEE754_FLOAT128 -typedef __float128 float128; +using float128 = __float128; #elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ !defined(__LONG_DOUBLE_IBM128__) && \ (defined(__GNUC__) || defined(__GNUG__)) diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h index 16e322bfd8785..ecde62d8368e7 100644 --- a/llvm/include/llvm/Support/thread.h +++ b/llvm/include/llvm/Support/thread.h @@ -127,7 +127,7 @@ LLVM_ABI thread::id llvm_thread_get_current_id_impl(); template <class Function, class... Args> thread::thread(std::optional<unsigned> StackSizeInBytes, Function &&f, Args &&...args) { - typedef std::tuple<std::decay_t<Function>, std::decay_t<Args>...> CalleeTuple; + using CalleeTuple = std::tuple<std::decay_t<Function>, std::decay_t<Args>...>; std::unique_ptr<CalleeTuple> Callee( new CalleeTuple(std::forward<Function>(f), std::forward<Args>(args)...)); diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h index e22c6d4f6d390..95866e306b5ff 100644 --- a/llvm/include/llvm/TableGen/CodeGenHelpers.h +++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h @@ -20,6 +20,7 @@ #include <string> namespace llvm { + // Simple RAII helper for emitting ifdef-undef-endif scope. 
class IfDefEmitter { public: @@ -57,7 +58,7 @@ class NamespaceEmitter { NamespaceEmitter(raw_ostream &OS, StringRef NameUntrimmed) : Name(trim(NameUntrimmed).str()), OS(OS) { if (!Name.empty()) - OS << "namespace " << Name << " {\n"; + OS << "namespace " << Name << " {\n\n"; } ~NamespaceEmitter() { close(); } @@ -65,7 +66,7 @@ class NamespaceEmitter { // Explicit function to close the namespace scopes. void close() { if (!Closed && !Name.empty()) - OS << "} // namespace " << Name << "\n"; + OS << "\n} // namespace " << Name << "\n"; Closed = true; } diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index a013f27766051..eb35e3644bd02 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5339,6 +5339,19 @@ struct AAPotentialConstantValues return nullptr; } + /// Return the minimum number of trailing zeros of the potential constants. + unsigned getAssumedMinTrailingZeros() const { + if (!isValidState() || getAssumedSet().empty()) + return 0; + unsigned TrailingZeros = getAssumedSet().begin()->getBitWidth() + 1; + for (const APInt &It : getAssumedSet()) { + if (It.countTrailingZeros() < TrailingZeros) + TrailingZeros = It.countTrailingZeros(); + } + if (TrailingZeros > getAssumedSet().begin()->getBitWidth()) + return 0; + return TrailingZeros; + } /// See AbstractAttribute::getName() StringRef getName() const override { return "AAPotentialConstantValues"; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e27a9b1c44014..5d88e5f54e3d6 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -806,11 +806,11 @@ class AccessAnalysis { typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA, + DominatorTree &DT, MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE, SmallPtrSetImpl<MDNode *> &LoopAliasScopes) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), - LoopAliasScopes(LoopAliasScopes) { + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA), + PSE(PSE), LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } @@ -934,6 +934,9 @@ class AccessAnalysis { /// The LoopInfo of the loop being checked. const LoopInfo *LI; + /// The dominator tree of the function. + DominatorTree &DT; + /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identifies which sets really need a /// dependence check. @@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// information from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, + const DominatorTree &DT, std::optional<int64_t> Stride = std::nullopt) { // FIXME: This should probably only return true for NUW. if (AR->getNoWrapFlags(SCEV::NoWrapMask)) @@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed.
if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, &DT, GEP](User *U) { + if (getLoadStorePointerOperand(U) != GEP) + return false; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, DT)) return false; } @@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() { /// Check whether the access through \p Ptr has a constant stride. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); @@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride)) return Stride; LLVM_DEBUG( @@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional<int64_t> StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional<int64_t> StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + std::optional<int64_t> StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true); + std::optional<int64_t> StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once @@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, + true)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076f51824..789a98366cead 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7419,84 +7419,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // If the value doesn't fit an unspecified value is returned (but this - // is not poison). 
- return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { @@ -10405,3 +10341,55 @@ const Value *llvm::stripNullTest(const Value *V) { Value *llvm::stripNullTest(Value *V) { return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V))); } + +bool llvm::collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> Worklist; + auto Push = [&](const Value *V) -> bool { + if (auto *C = dyn_cast<Constant>(V)) { + if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C)) + return false; + // Check existence first to avoid unnecessary allocations. + if (Constants.contains(C)) + return true; + if (Constants.size() == MaxCount) + return false; + Constants.insert(C); + return true; + } + + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (Visited.insert(Inst).second) + Worklist.push_back(Inst); + return true; + } + return false; + }; + if (!Push(V)) + return false; + while (!Worklist.empty()) { + const Instruction *CurInst = Worklist.pop_back_val(); + switch (CurInst->getOpcode()) { + case Instruction::Select: + if (!Push(CurInst->getOperand(1))) + return false; + if (!Push(CurInst->getOperand(2))) + return false; + break; + case Instruction::PHI: + for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) { + // Fast path for recurrence PHI. + if (IncomingValue == CurInst) + continue; + if (!Push(IncomingValue)) + return false; + } + break; + default: + return false; + } + } + return true; +} diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d94843698c..977ed59e09243 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. 
- int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving( assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); - if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides, + /*Assume=*/false, /*ShouldCheckWrap=*/true) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt index 4b2debb7ae236..0c8af1e7a4565 100644 --- a/llvm/lib/BinaryFormat/CMakeLists.txt +++ b/llvm/lib/BinaryFormat/CMakeLists.txt @@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat ELF.cpp MachO.cpp Magic.cpp - Minidump.cpp MsgPackDocument.cpp MsgPackDocumentYAML.cpp MsgPackReader.cpp diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb02696f4..8930d64de5e37 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 +2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } @@ -8566,16 +8568,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo <OInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8583,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. + return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. 
@@ -8607,9 +8606,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8635,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c7c3b03..76494c792ac7b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp index 571c5b3ca5b4b..003c850275ff4 100644 --- a/llvm/lib/CAS/ActionCaches.cpp +++ b/llvm/lib/CAS/ActionCaches.cpp @@ -13,7 +13,11 @@ #include "BuiltinCAS.h" #include "llvm/ADT/TrieRawHashMap.h" #include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Errc.h" #define DEBUG_TYPE "cas-action-caches" @@ -47,12 +51,54 @@ class InMemoryActionCache final : public ActionCache { Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, bool CanBeDistributed) const final; + Error validate() const final { + return createStringError("InMemoryActionCache doesn't support validate()"); + } + private: using DataT = CacheEntry<sizeof(HashType)>; using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>; InMemoryCacheT Cache; }; + +/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB. 
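+/// It maps an action key (a hash) directly to the fixed-size hash of the
+/// cached result in the underlying key-value database.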
+class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + + Error validate() const final; + +private: + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() { } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr<OnDiskActionCache>( + new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> +OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef<char> Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. 
+ return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + + auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); + std::optional<ArrayRef<char>> Observed; + if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) + return E; + + auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); + if (*Expected == ObservedID) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { + auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); + auto formatError = [&](Twine Msg) { + return createStringError( + llvm::errc::illegal_byte_sequence, + "bad record at 0x" + + utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + + Msg.str()); + }; + if (ID.getOpaqueData() == 0) + return formatError("zero is not a valid ref"); + return Error::success(); + }; + return UniDB->getKeyValueDB().validate(ValidateRef); +} + +Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad2c3528..e9bc6d8beed4e 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { return Error::success(); } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return 
createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d5e1850..4d2de66cf636f 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ class BuiltinCAS : public ObjectStore { "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000000000..f3f6fa043bc52 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49e50145..aad77dce370d8 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70de0849..2d4eedd5bdc8f 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ class InMemoryCAS : public BuiltinCAS { return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { return cast<InMemoryInlineObject>(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected<ObjectRef> InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50bbe013a..3110577e03774 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include <optional> +#include <deque> using namespace llvm; using namespace llvm::cas; @@ -21,6 +21,7 @@ void CASContext::anchor() {} void ObjectStore::anchor() {} LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } @@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) { auto [I, Inserted] = ValidatedRefs.insert(Ref); if (!Inserted) continue; // already validated. - if (Error E = validate(getID(Ref))) + if (Error E = validateObject(getID(Ref))) return E; Expected<ObjectHandle> Obj = load(Ref); if (!Obj) @@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) { return Error::success(); } +Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream, + ObjectRef Other) { + // Copy the full CAS tree from upstream with depth-first ordering to ensure + // all the child nodes are available in the downstream CAS before inserting + // the current object. This uses a similar algorithm to + // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema + // so it can be used to import from any other ObjectStore regardless of the + // CAS schema. + + // There is no work to do if importing from self. + if (this == &Upstream) + return Other; + + /// Keeps track of the state of visitation for the current node and all of its + /// parents. UpstreamCursor holds information only from the upstream CAS. + struct UpstreamCursor { + ObjectRef Ref; + ObjectHandle Node; + size_t RefsCount; + std::deque<ObjectRef> Refs; + }; + SmallVector<UpstreamCursor, 16> CursorStack; + /// PrimaryRefStack holds the ObjectRefs of the current CAS, with nodes either + /// just stored in the CAS or nodes that already exist in the current CAS. + SmallVector<ObjectRef, 128> PrimaryRefStack; + /// A map from upstream ObjectRef to current ObjectRef. + llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects; + + auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) { + unsigned NumRefs = Upstream.getNumRefs(Node); + std::deque<ObjectRef> Refs; + for (unsigned I = 0; I < NumRefs; ++I) + Refs.push_back(Upstream.readRef(Node, I)); + + CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)}); + }; + + auto UpstreamHandle = Upstream.load(Other); + if (!UpstreamHandle) + return UpstreamHandle.takeError(); + enqueueNode(Other, *UpstreamHandle); + + while (!CursorStack.empty()) { + UpstreamCursor &Cur = CursorStack.back(); + if (Cur.Refs.empty()) { + // Copy the node data into the primary store. + // The top of \p PrimaryRefStack contains the new ObjectRefs for the + // children of the current node. + assert(PrimaryRefStack.size() >= Cur.RefsCount); + auto Refs = ArrayRef(PrimaryRefStack) + .slice(PrimaryRefStack.size() - Cur.RefsCount); + auto NewNode = store(Refs, Upstream.getData(Cur.Node)); + if (!NewNode) + return NewNode.takeError(); + + // Remove the current node's cursor and its children's refs from the stacks. + PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); + CursorStack.pop_back(); + + PrimaryRefStack.push_back(*NewNode); + CreatedObjects.try_emplace(Cur.Ref, *NewNode); + continue; + } + + // Check if the node exists already.
+ auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If exists already, just need to enqueue the primary node. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load child. + auto PrimaryID = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + + enqueueNode(CurrentID, *PrimaryID); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr<MemoryBuffer> ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000000000..7d29f4499211e --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional<ObjectRef> getReference(const CASID &ID) const final; + + Expected<bool> isMaterialized(ObjectRef Ref) const final; + + ArrayRef<char> getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + Error validate(bool CheckHash) const final; + + static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + + OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; + Expected<std::optional<uint64_t>> getStorageSize() const final; + Error pruneStorageData() final; + + OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) + : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) 
{} + + std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; + std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { + auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, + SmallVectorImpl<uint8_t> &Result) { + auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( + Refs, Data); + Result.assign(Hash.begin(), Hash.end()); + }; + + if (auto E = DB->validate(CheckHash, Hasher)) + return E; + + return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { + std::optional<ondisk::ObjectID> ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + SmallVector<ondisk::ObjectID, 64> IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + auto StoredID = DB->getReference(ComputedHash); + if (LLVM_UNLIKELY(!StoredID)) + return StoredID.takeError(); + if (Error E = DB->store(*StoredID, IDs, Data)) + return std::move(E); + return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { + UnifiedDB->setSizeLimit(SizeLimit); + return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { + return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + +Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { + Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + return true; +#else + return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. 
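+  // For now, normalize the caller-provided path to an absolute path before
+  // opening the store.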
+ SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9dc8e159..245b6fb832549 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const { } Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { + if (UpstreamDB) { + if (auto E = UpstreamDB->validate(Deep, Hasher)) + return E; + } return Index.validate([&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) { return I.takeError(); TrieRecord::Data Object = I->Ref.load(); - if (Object.SK == TrieRecord::StorageKind::Unknown) { - if (!UpstreamDB) - return std::nullopt; + if (Object.SK == TrieRecord::StorageKind::Unknown) return faultInFromUpstream(ExternalRef); - } if (Object.SK == TrieRecord::StorageKind::DataPool) return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef, TrieRecord::Data Object = I->Ref.load(); if (Object.SK != TrieRecord::StorageKind::Unknown) return ObjectPresence::InPrimaryDB; + if (!CheckUpstream || !UpstreamDB) return ObjectPresence::Missing; + std::optional<ObjectID> UpstreamID = UpstreamDB->getExistingReference(getDigest(*I)); return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const { return std::max(IndexPercent, DataPercent); } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( - StringRef AbsPath, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, + unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, + FaultInPolicy Policy) { if (std::error_code EC = sys::fs::create_directories(AbsPath)) return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( "unexpected user header in '" + DataPoolPath + "'"); - return std::unique_ptr<OnDiskGraphDB>( - new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), - std::move(UpstreamDB), Policy)); + return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( + AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy)); } OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, - FaultInPolicy Policy) + OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy) : Index(std::move(Index)), DataPool(std::move(DataPool)), - RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), - FIPolicy(Policy) { + RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) { /// Lifetime for "big" objects not in DataPool. /// /// NOTE: Could use ThreadSafeTrieRawHashMap here. 
For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, // against the process dying during importing and leaving the database with an // incomplete tree. Note that if the upstream has missing nodes then the tree // will be copied with missing nodes as well, it won't be considered an error. - struct UpstreamCursor { ObjectHandle Node; size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, // Copy the node data into the primary store. // FIXME: Use hard-link or cloning if the file-system supports it and data is // stored into a separate file. - auto Data = UpstreamDB->getObjectData(UpstreamNode); auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, Expected<std::optional<ObjectHandle>> OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { - assert(UpstreamDB); + if (!UpstreamDB) + return std::nullopt; auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 21860717da3bf..15656cb38a5e5 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@ #include "llvm/CAS/OnDiskKeyValueDB.h" #include "OnDiskCommon.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>> OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) { // Check the result cache. OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); - if (!ActionP) + if (ActionP) { + assert(isAddrAligned(Align(8), ActionP->Data.data())); + return ActionP->Data; + } + if (!UnifiedCache || !UnifiedCache->UpstreamKVDB) return std::nullopt; - assert(isAddrAligned(Align(8), ActionP->Data.data())); - return ActionP->Data; + + // Try to fault in from upstream. + return UnifiedCache->faultInFromUpstreamKV(Key); } Expected<std::unique_ptr<OnDiskKeyValueDB>> OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize) { + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *Cache) { if (std::error_code EC = sys::fs::create_directories(Path)) return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, return std::move(E); return std::unique_ptr<OnDiskKeyValueDB>( - new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); + new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache)); } Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { + if (UnifiedCache && UnifiedCache->UpstreamKVDB) { + if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) + return E; + } return Cache.validate( [&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000000000..ae9d818241f4b --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,613 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one +/// directory while also restricting storage growth with a scheme of chaining +/// the two most recent directories (primary & upstream), where the primary +/// "faults-in" data from the upstream one. When the primary (most recent) +/// directory exceeds its intended limit, a new empty directory becomes the +/// primary one. +/// +/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open +/// receives) there are directories named like this: +/// +/// 'v<version>.<x>' +/// 'v<version>.<x+1>' +/// 'v<version>.<x+2>' +/// ... +/// +/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme, and +/// the part after the dot is an increasing integer. The primary directory is +/// the one with the highest integer and the upstream one is the directory +/// before it. For example, if the sub-directories contained are: +/// +/// 'v1.5', 'v1.6', 'v1.7', 'v1.8' +/// +/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are +/// unused directories that can be safely deleted at any time and by any +/// process. +/// +/// Contained within the top-level directory is a file named "lock", which is +/// used for processes to take shared or exclusive locks for the contents of the +/// top directory. While a \p UnifiedOnDiskCache is open, it keeps a shared lock +/// for the top-level directory; when it closes, if the primary sub-directory +/// exceeded its limit, it attempts to get an exclusive lock in order to create +/// a new empty primary directory; if it can't get the exclusive lock, it gives +/// up and leaves it to the next \p UnifiedOnDiskCache instance that closes to +/// try again. +/// +/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +/// directory, by any process, the storage size in that directory will keep +/// growing unrestricted. But the major benefit is that garbage-collection can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers in the same process or other +/// processes. +/// +/// The \c UnifiedOnDiskCache also provides validation and recovery on top of +/// the underlying on-disk storage. The low-level storage is designed to remain +/// coherent across regular process crashes, but may be invalid after power loss +/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows +/// validating the contents once per boot and can recover by marking invalid +/// data for garbage collection. +/// +/// The data recovery described above requires exclusive access to the CAS, and +/// it is an error to attempt recovery if the CAS is open in any process/thread. +/// In order to maximize backwards compatibility with tools that do not perform +/// validation before opening the CAS, we do not attempt to get exclusive access +/// until recovery is actually performed, meaning as long as the data is valid +/// it will not conflict with concurrent use.
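+///
+/// As a rough illustration only (the directory numbers here are hypothetical;
+/// actual contents depend on prior use), a top-level directory managed by this
+/// scheme might look like:
+///
+///   <root>/
+///     lock            - shared/exclusive lock taken while instances are open
+///     v1.validation   - records the boot time of the last validation run
+///     v1.7/           - upstream directory (the primary faults data in from it)
+///     v1.8/           - primary directory (receives all new data)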
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "BuiltinCAS.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include <optional> + +#if __has_include(<sys/sysctl.h>) +#include <sys/sysctl.h> +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. +static constexpr StringLiteral DBDirPrefix = "v1."; + +static constexpr StringLiteral ValidationFilename = "v1.validation"; +static constexpr StringLiteral CorruptPrefix = "corrupt."; + +ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) { + // little endian encoded. + assert(Value.size() == sizeof(uint64_t)); + return ObjectID::fromOpaqueData(support::endian::read64le(Value.data())); +} + +UnifiedOnDiskCache::ValueBytes +UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) { + // little endian encoded. + UnifiedOnDiskCache::ValueBytes ValBytes; + static_assert(ValBytes.size() == sizeof(ID.getOpaqueData())); + support::endian::write64le(ValBytes.data(), ID.getOpaqueData()); + return ValBytes; +} + +Expected<std::optional<ArrayRef<char>>> +UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) { + assert(UpstreamGraphDB); + assert(UpstreamKVDB); + + std::optional<ArrayRef<char>> UpstreamValue; + if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue)) + return std::move(E); + if (!UpstreamValue) + return std::nullopt; + + // The value is the \p ObjectID in the context of the upstream + // \p OnDiskGraphDB instance. Translate it to the context of the primary + // \p OnDiskGraphDB instance. + ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue); + auto PrimaryID = + PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID)); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID)); +} + +/// \returns all the 'v<version>.<x>' names of sub-directories, sorted with +/// ascending order of the integer after the dot. Corrupt directories, if +/// included, will come first. 
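+/// For example (hypothetical directory contents), {'v1.7', 'corrupt.0.v1.5',
+/// 'v1.6'} with \p IncludeCorrupt set yields {'corrupt.0.v1.5', 'v1.6',
+/// 'v1.7'}.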
+static Expected<SmallVector<std::string, 4>> +getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) { + struct DBDir { + uint64_t Order; + std::string Name; + }; + SmallVector<DBDir> FoundDBDirs; + + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() != sys::fs::file_type::directory_file) + continue; + StringRef SubDir = sys::path::filename(DirI->path()); + if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) { + FoundDBDirs.push_back({0, std::string(SubDir)}); + continue; + } + if (!SubDir.starts_with(DBDirPrefix)) + continue; + uint64_t Order; + if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order)) + return createStringError(inconvertibleErrorCode(), + "unexpected directory " + DirI->path()); + FoundDBDirs.push_back({Order, std::string(SubDir)}); + } + if (EC) + return createFileError(Path, EC); + + llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool { + return LHS.Order <= RHS.Order; + }); + + SmallVector<std::string, 4> DBDirs; + for (DBDir &Dir : FoundDBDirs) + DBDirs.push_back(std::move(Dir.Name)); + return DBDirs; +} + +static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) { + auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true); + if (!DBDirs) + return DBDirs.takeError(); + + // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure + // out how to handle the leftover sub-directories of the previous version. + + for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) { + StringRef Back(DBDirs->back()); + if (Back.starts_with(CorruptPrefix)) + break; + DBDirs->pop_back(); + } + return *DBDirs; +} + +/// \returns Given a sub-directory named 'v<version>.<x>', it outputs the +/// 'v<version>.<x+1>' name. 
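+/// For example, given 'v1.8' this writes 'v1.9' to \p OS.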
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) { + assert(DBDir.starts_with(DBDirPrefix)); + uint64_t Count; + bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count); + assert(!Failed); + (void)Failed; + OS << DBDirPrefix << Count + 1; +} + +static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath, + bool CheckHash) { + SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"}; + if (CheckHash) + Args.push_back("-check-hash"); + + llvm::SmallString<128> StdErrPath; + int StdErrFD = -1; + if (std::error_code EC = sys::fs::createTemporaryFile( + "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath, + llvm::sys::fs::OF_Text)) + return createStringError(EC, "failed to create temporary file"); + FileRemover OutputRemover(StdErrPath.c_str()); + + std::optional<llvm::StringRef> Redirects[] = { + {""}, // stdin = /dev/null + {""}, // stdout = /dev/null + StdErrPath.str(), + }; + + std::string ErrMsg; + int Result = + sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects, + /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg); + + if (Result == -1) + return createStringError("failed to exec " + join(Args, " ") + ": " + + ErrMsg); + if (Result != 0) { + llvm::SmallString<64> Err("cas contents invalid"); + if (!ErrMsg.empty()) { + Err += ": "; + Err += ErrMsg; + } + auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str()); + if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) { + Err += ": "; + Err += (*StdErrBuf)->getBuffer(); + } + return createStringError(Err); + } + return Error::success(); +} + +static Error validateInProcess(StringRef RootPath, StringRef HashName, + unsigned HashByteSize, bool CheckHash) { + std::shared_ptr<UnifiedOnDiskCache> UniDB; + if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName, + HashByteSize) + .moveInto(UniDB)) + return E; + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + if (Error E = CAS->validate(CheckHash)) + return E; + auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB); + if (Error E = Cache->validate()) + return E; + return Error::success(); +} + +static Expected<uint64_t> getBootTime() { +#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME) + struct timeval TV; + size_t TVLen = sizeof(TV); + int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME}; + if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0) + return createStringError(llvm::errnoAsErrorCode(), + "failed to get boottime"); + if (TVLen != sizeof(TV)) + return createStringError("sysctl kern.boottime unexpected format"); + return TV.tv_sec; +#elif defined(__linux__) + // Use the mtime for /proc, which is recreated during system boot. + // We could also read /proc/stat and search for 'btime'. 
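On Linux the code above approximates the boot time from the mtime of /proc, which the kernel recreates at boot. A tiny illustration of the same idea with std::filesystem; note that before C++20 the epoch of the filesystem clock is unspecified, so only equality between runs on the same boot is meaningful, which is all the validation gate below needs:

#include <filesystem>
#include <iostream>
#include <system_error>

int main() {
  std::error_code EC;
  auto MTime = std::filesystem::last_write_time("/proc", EC);
  if (EC) {
    std::cerr << "cannot stat /proc: " << EC.message() << "\n";
    return 1;
  }
  // The raw tick count only needs to be stable for the lifetime of a boot.
  std::cout << MTime.time_since_epoch().count() << "\n";
  return 0;
}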
+ sys::fs::file_status Status; + if (std::error_code EC = sys::fs::status("/proc", Status)) + return createFileError("/proc", EC); + return Status.getLastModificationTime().time_since_epoch().count(); +#else + llvm::report_fatal_error("getBootTime unimplemented"); +#endif +} + +Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded( + StringRef RootPath, StringRef HashName, unsigned HashByteSize, + bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, ValidationFilename); + int FD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(FD != -1); + + sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD); + auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); }); + + if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive)) + return createFileError(PathBuf, EC); + auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); }); + + SmallString<8> Bytes; + if (Error E = sys::fs::readNativeFileToEOF(File, Bytes)) + return createFileError(PathBuf, std::move(E)); + + uint64_t ValidationBootTime = 0; + if (!Bytes.empty() && + StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime)) + return createFileError(PathBuf, errc::illegal_byte_sequence, + "expected integer"); + + static uint64_t BootTime = 0; + if (BootTime == 0) + if (Error E = getBootTime().moveInto(BootTime)) + return std::move(E); + + std::string LogValidationError; + + if (ValidationBootTime == BootTime && !ForceValidation) + return ValidationResult::Skipped; + + // Validate! + bool NeedsRecovery = false; + if (Error E = + LLVMCasBinaryPath + ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash) + : validateInProcess(RootPath, HashName, HashByteSize, + CheckHash)) { + if (AllowRecovery) { + consumeError(std::move(E)); + NeedsRecovery = true; + } else { + return std::move(E); + } + } + + if (NeedsRecovery) { + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, "lock"); + + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); }); + if (std::error_code EC = tryLockFileThreadSafe(LockFD)) { + if (EC == std::errc::no_lock_available) + return createFileError( + PathBuf, EC, + "CAS validation requires exclusive access but CAS was in use"); + return createFileError(PathBuf, EC); + } + auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + + for (StringRef DBDir : *DBDirs) { + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, DBDir); + std::error_code EC; + int Attempt = 0, MaxAttempts = 100; + SmallString<128> GCPath; + for (; Attempt < MaxAttempts; ++Attempt) { + GCPath.assign(RootPath); + sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) + + "." + DBDir); + EC = sys::fs::rename(PathBuf, GCPath); + // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST. 
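The recovery path below parks an invalid database directory under a 'corrupt.<attempt>.' prefix, bumping the attempt number whenever the destination is already occupied (reported as ENOTEMPTY or EEXIST depending on the platform). A hedged std::filesystem sketch of that retry loop, with illustrative names rather than the patch's helpers:

#include <filesystem>
#include <string>
#include <system_error>

namespace fs = std::filesystem;

// Rename Root/Name to Root/corrupt.<N>.Name, probing N until a destination
// that is not already occupied is found.
static std::error_code moveAside(const fs::path &Root, const std::string &Name) {
  std::error_code EC;
  for (int Attempt = 0; Attempt < 100; ++Attempt) {
    fs::path Dest = Root / ("corrupt." + std::to_string(Attempt) + "." + Name);
    fs::rename(Root / Name, Dest, EC);
    if (EC != std::errc::directory_not_empty && EC != std::errc::file_exists)
      return EC; // success, or an error that retrying cannot fix
  }
  return std::make_error_code(std::errc::file_exists);
}

int main() {
  fs::path Root = fs::temp_directory_path() / "uodc-demo";
  fs::create_directories(Root / "v1.1");
  return moveAside(Root, "v1.1") ? 1 : 0;
}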
+ if (EC != errc::directory_not_empty && EC != errc::file_exists) + break; + } + if (Attempt == MaxAttempts) + return createStringError( + EC, "rename " + PathBuf + + " failed: too many CAS directories awaiting pruning"); + if (EC) + return createStringError(EC, "rename " + PathBuf + " to " + GCPath + + " failed: " + EC.message()); + } + } + + if (ValidationBootTime != BootTime) { + // Fix filename in case we have error to report. + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, ValidationFilename); + if (std::error_code EC = sys::fs::resize_file(FD, 0)) + return createFileError(PathBuf, EC); + raw_fd_ostream OS(FD, /*shouldClose=*/false); + OS.seek(0); // resize does not reset position + OS << BootTime << '\n'; + if (OS.has_error()) + return createFileError(PathBuf, OS.error()); + } + + return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid; +} + +Expected<std::unique_ptr<UnifiedOnDiskCache>> +UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit, + StringRef HashName, unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, "lock"); + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(LockFD != -1); + // Locking the directory using shared lock, which will prevent other processes + // from creating a new chain (essentially while a \p UnifiedOnDiskCache + // instance holds a shared lock the storage for the primary directory will + // grow unrestricted). + if (std::error_code EC = + lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared)) + return createFileError(PathBuf, EC); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + if (DBDirs->empty()) + DBDirs->push_back((Twine(DBDirPrefix) + "1").str()); + + assert(!DBDirs->empty()); + + /// If there is only one directory open databases on it. If there are 2 or + /// more directories, get the most recent directories and chain them, with the + /// most recent being the primary one. The remaining directories are unused + /// data than can be garbage-collected. + auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache()); + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + if (DBDirs->size() > 1) { + StringRef UpstreamDir = *(DBDirs->end() - 2); + PathBuf = RootPath; + sys::path::append(PathBuf, UpstreamDir); + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + /*UpstreamDB=*/nullptr, FaultInPolicy) + .moveInto(UpstreamGraphDB)) + return std::move(E); + if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t)) + .moveInto(UpstreamKVDB)) + return std::move(E); + } + + StringRef PrimaryDir = *(DBDirs->end() - 1); + PathBuf = RootPath; + sys::path::append(PathBuf, PrimaryDir); + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + UpstreamGraphDB.get(), FaultInPolicy) + .moveInto(PrimaryGraphDB)) + return std::move(E); + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; + // \p UnifiedOnDiskCache does manual chaining for key-value requests, + // including an extra translation step of the value during fault-in. 
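To make the manual chaining concrete: a lookup goes to the primary store first and, on a miss, consults the upstream store and copies the value forward so later lookups hit the primary directly. A toy sketch with std::map standing in for the on-disk key-value databases (purely illustrative; the real fault-in additionally re-maps the upstream ObjectID into the primary store's numbering):

#include <cassert>
#include <map>
#include <optional>
#include <string>

using KVStore = std::map<std::string, std::string>;

// Look up Key in Primary, faulting the value in from Upstream on a miss.
static std::optional<std::string> chainedGet(KVStore &Primary,
                                             const KVStore &Upstream,
                                             const std::string &Key) {
  if (auto It = Primary.find(Key); It != Primary.end())
    return It->second;
  auto It = Upstream.find(Key);
  if (It == Upstream.end())
    return std::nullopt;
  // In the real cache the upstream value would be translated before being
  // recorded in the primary chain.
  Primary[Key] = It->second;
  return It->second;
}

int main() {
  KVStore Primary, Upstream{{"k", "v"}};
  assert(chainedGet(Primary, Upstream, "k") == "v");
  assert(Primary.count("k") == 1); // now cached in the primary chain
  assert(!chainedGet(Primary, Upstream, "missing"));
  return 0;
}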
+ if (Error E = + OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t), UniDB.get()) + .moveInto(PrimaryKVDB)) + return std::move(E); + + UniDB->RootPath = RootPath; + UniDB->SizeLimit = SizeLimit.value_or(0); + UniDB->LockFD = LockFD; + UniDB->NeedsGarbageCollection = DBDirs->size() > 2; + UniDB->PrimaryDBDir = PrimaryDir; + UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB); + UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB); + UniDB->UpstreamKVDB = std::move(UpstreamKVDB); + UniDB->PrimaryKVDB = std::move(PrimaryKVDB); + + return std::move(UniDB); +} + +void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) { + this->SizeLimit = SizeLimit.value_or(0); +} + +uint64_t UnifiedOnDiskCache::getStorageSize() const { + uint64_t TotalSize = getPrimaryStorageSize(); + if (UpstreamGraphDB) + TotalSize += UpstreamGraphDB->getStorageSize(); + if (UpstreamKVDB) + TotalSize += UpstreamKVDB->getStorageSize(); + return TotalSize; +} + +uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const { + return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize(); +} + +bool UnifiedOnDiskCache::hasExceededSizeLimit() const { + uint64_t CurSizeLimit = SizeLimit; + if (!CurSizeLimit) + return false; + + // If the hard limit is beyond 85%, declare above limit and request clean up. + unsigned CurrentPercent = + std::max(PrimaryGraphDB->getHardStorageLimitUtilization(), + PrimaryKVDB->getHardStorageLimitUtilization()); + if (CurrentPercent > 85) + return true; + + // We allow each of the directories in the chain to reach up to half the + // intended size limit. Check whether the primary directory has exceeded half + // the limit or not, in order to decide whether we need to start a new chain. + // + // We could check the size limit against the sum of sizes of both the primary + // and upstream directories but then if the upstream is significantly larger + // than the intended limit, it would trigger a new chain to be created before + // the primary has reached its own limit. Essentially in such situation we + // prefer reclaiming the storage later in order to have more consistent cache + // hits behavior. + return (CurSizeLimit / 2) < getPrimaryStorageSize(); +} + +Error UnifiedOnDiskCache::close(bool CheckSizeLimit) { + if (LockFD == -1) + return Error::success(); // already closed. + auto CloseLock = make_scope_exit([&]() { + assert(LockFD >= 0); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + sys::fs::closeFile(LockFile); + LockFD = -1; + }); + + bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false; + UpstreamKVDB.reset(); + PrimaryKVDB.reset(); + UpstreamGraphDB.reset(); + PrimaryGraphDB.reset(); + if (std::error_code EC = unlockFileThreadSafe(LockFD)) + return createFileError(RootPath, EC); + + if (!ExceededSizeLimit) + return Error::success(); + + // The primary directory exceeded its intended size limit. Try to get an + // exclusive lock in order to create a new primary directory for next time + // this \p UnifiedOnDiskCache path is opened. + + if (std::error_code EC = tryLockFileThreadSafe( + LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) { + if (EC == errc::no_lock_available) + return Error::success(); // couldn't get exclusive lock, give up. 
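The growth policy in hasExceededSizeLimit above starts a new chain when either database is close to its hard map limit or the primary directory alone has consumed more than half of the configured size limit. The same decision reduced to plain integers, as a sketch using the 85% threshold stated in the code above:

#include <algorithm>
#include <cassert>
#include <cstdint>

static bool exceededSizeLimit(uint64_t SizeLimit, unsigned GraphUtilPercent,
                              unsigned KVUtilPercent, uint64_t PrimarySize) {
  if (!SizeLimit)
    return false; // no limit configured
  if (std::max(GraphUtilPercent, KVUtilPercent) > 85)
    return true; // close to the hard map limit, start a new chain
  return (SizeLimit / 2) < PrimarySize; // primary may use half the budget
}

int main() {
  assert(!exceededSizeLimit(0, 99, 99, 1000));  // limit disabled
  assert(exceededSizeLimit(1000, 90, 10, 100)); // hard-limit pressure
  assert(exceededSizeLimit(1000, 10, 10, 600)); // over half the budget
  assert(!exceededSizeLimit(1000, 10, 10, 400));
  return 0;
}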
+ return createFileError(RootPath, EC); + } + auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + // Managed to get an exclusive lock which means there are no other open + // \p UnifiedOnDiskCache instances for the same path, so we can safely start a + // new primary directory. To start a new primary directory we just have to + // create a new empty directory with the next consecutive index; since this is + // an atomic operation we will leave the top-level directory in a consistent + // state even if the process dies during this code-path. + + SmallString<256> PathBuf(RootPath); + raw_svector_ostream OS(PathBuf); + OS << sys::path::get_separator(); + getNextDBDirName(PrimaryDBDir, OS); + if (std::error_code EC = sys::fs::create_directory(PathBuf)) + return createFileError(PathBuf, EC); + + NeedsGarbageCollection = true; + return Error::success(); +} + +UnifiedOnDiskCache::UnifiedOnDiskCache() = default; + +UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); } + +Error UnifiedOnDiskCache::collectGarbage(StringRef Path) { + auto DBDirs = getAllGarbageDirs(Path); + if (!DBDirs) + return DBDirs.takeError(); + + SmallString<256> PathBuf(Path); + for (StringRef UnusedSubDir : *DBDirs) { + sys::path::append(PathBuf, UnusedSubDir); + if (std::error_code EC = sys::fs::remove_directories(PathBuf)) + return createFileError(PathBuf, EC); + sys::path::remove_filename(PathBuf); + } + return Error::success(); +} + +Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); } diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 53f1cfe24a68d..6412949948c07 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); - assert(AddrAlign >= - F->getDataLayout().getTypeStoreSize(ResultTy) && + assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) && "Expected at least natural alignment at this point."); // Given: atomicrmw some_op iN* %addr, iN %incr ordering @@ -1295,7 +1295,13 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); Value *TryAgain = Builder.CreateICmpNE( StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); - Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + Instruction *CondBr = Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + // Atomic RMW expands to a Load-linked / Store-Conditional loop. Because it is + // hard to predict precise branch weights, we mark the branch as "unknown" + // (50/50) to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Loaded; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fc90d0852aad..4fd220481cedb 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool targetSupportsBF16Type(const MachineFunction *MF) { + return MF->getTarget().getTargetTriple().isSPIRV(); +} + static bool containsBF16Type(const User &U) { // BF16 cannot currently be represented by LLT, to avoid miscompiles we // prevent any instructions using them. FIXME: This can be removed once LLT @@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) { bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; // Get or create a virtual register for each value. @@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; Register Op0 = getOrCreateVReg(*U.getOperand(0)); @@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; auto *CI = cast<CmpInst>(&U); @@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; uint32_t Flags = 0; @@ -2688,7 +2692,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(CB)) + if (containsBF16Type(CB) && !targetSupportsBF16Type(MF)) return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering(); @@ -2779,7 +2783,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { - if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const CallInst &CI = cast<CallInst>(U); diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 4b4df98024f4a..637acd61c8a5f 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -109,8 +109,10 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, if (auto *CI = dyn_cast<ConstantInt>(NumericConstant)) { if (CI->getBitWidth() > 64) MIB.addCImm(CI); - else + else if (CI->getBitWidth() == 1) MIB.addImm(CI->getZExtValue()); + else + MIB.addImm(CI->getSExtValue()); } else if (auto *CFP = dyn_cast<ConstantFP>(NumericConstant)) { MIB.addFPImm(CFP); } else if (isa<ConstantPointerNull>(NumericConstant)) { diff --git 
a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index c31454a8affda..b5d3092ee84d8 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB, } bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + // Do not split functions when -basic-block-sections=all is specified. if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) return false; diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index da29ffc9d2fed..88d81993fbe55 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -136,6 +136,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSetVector<Register, 8> ExternUses; SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; + SmallVector<std::pair<Register, Register>> TiedOperands; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) @@ -161,6 +162,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, // External def is now killed. KilledUseSet.insert(Reg); } + if (MO.isTied() && Reg.isVirtual()) { + // Record tied operand constraints that involve virtual registers so + // that bundles that are formed pre-register allocation reflect the + // relevant constraints. + unsigned TiedIdx = MII->findTiedOperandIdx(MO.getOperandNo()); + MachineOperand &TiedMO = MII->getOperand(TiedIdx); + Register DefReg = TiedMO.getReg(); + TiedOperands.emplace_back(DefReg, Reg); + } } } @@ -203,7 +213,17 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, bool isKill = KilledUseSet.contains(Reg); bool isUndef = UndefUseSet.contains(Reg); MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) | - getImplRegState(true)); + getImplRegState(true)); + } + + for (auto [DefReg, UseReg] : TiedOperands) { + unsigned DefIdx = + std::distance(LocalDefs.begin(), llvm::find(LocalDefs, DefReg)); + unsigned UseIdx = + std::distance(ExternUses.begin(), llvm::find(ExternUses, UseReg)); + assert(DefIdx < LocalDefs.size()); + assert(UseIdx < ExternUses.size()); + MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index f18c051142960..73993705c4a7b 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -2559,7 +2559,7 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { for (unsigned i = 0; i < ResourceCount; ++i) { ReservedCyclesIndex[i] = NumUnits; NumUnits += SchedModel->getProcResource(i)->NumUnits; - if (isUnbufferedGroup(i)) { + if (isReservedGroup(i)) { auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin; for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits; U != UE; ++U) @@ -2631,7 +2631,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, assert(NumberOfInstances > 0 && "Cannot have zero instances of a ProcResource"); - if (isUnbufferedGroup(PIdx)) { + if (isReservedGroup(PIdx)) { // If any subunits are used by the instruction, report that the // subunits of the resource group are available at the first cycle // in which the unit is available, effectively removing the group diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp 
b/llvm/lib/CodeGen/MachineVerifier.cpp index c0710c467a2e6..fdf10480b6e05 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2584,6 +2584,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Extra explicit operand on non-variadic instruction", MO, MONum); } + // Verify earlyClobber def operand + if (MCID.getOperandConstraint(MONum, MCOI::EARLY_CLOBBER) != -1) { + if (!MO->isReg()) + report("Early clobber must be a register", MI); + if (!MO->isEarlyClobber()) + report("Missing earlyClobber flag", MI); + } + switch (MO->getType()) { case MachineOperand::MO_Register: { // Verify debug flag on debug instructions. Check this first because reg0 diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 38f6deb39ddf3..99f76936a180f 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1600,6 +1600,22 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, SlotIndex DefIndex = CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber()); VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); + + // Refine the subranges that are now defined by the remat. + // This will split existing subranges if necessary. + DstInt.refineSubRanges( + Alloc, DstMask, + [&DefIndex, &Alloc](LiveInterval::SubRange &SR) { + // We know that this lane is defined by this instruction, + // but at this point it might not be live because it was not defined + // by the original instruction. This happens when the + // rematerialization widens the defined register. Assign that lane a + // dead def so that the interferences are properly modeled. + if (!SR.liveAt(DefIndex)) + SR.createDeadDef(DefIndex, Alloc); + }, + *LIS->getSlotIndexes(), *TRI); + for (LiveInterval::SubRange &SR : DstInt.subranges()) { if ((SR.LaneMask & DstMask).none()) { LLVM_DEBUG(dbgs() @@ -1617,14 +1633,6 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, // updateRegDefUses. The original subrange def may have only undefed // some lanes. UpdatedSubRanges = true; - } else { - // We know that this lane is defined by this instruction, - // but at this point it might not be live because it was not defined - // by the original instruction. This happens when the - // rematerialization widens the defined register. Assign that lane a - // dead def so that the interferences are properly modeled. 
- if (!SR.liveAt(DefIndex)) - SR.createDeadDef(DefIndex, Alloc); } } if (UpdatedSubRanges) diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index bb10cf687db8d..d84c3fb05bb24 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -733,6 +733,8 @@ MachineOperand GetMOForConstDbgOp(const SDDbgOperand &Op) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { if (CI->getBitWidth() > 64) return MachineOperand::CreateCImm(CI); + if (CI->getBitWidth() == 1) + return MachineOperand::CreateImm(CI->getZExtValue()); return MachineOperand::CreateImm(CI->getSExtValue()); } if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a81002074f..316aacdf6978e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ class SelectionDAGLegalize { RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + SDValue ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + 
.setChain(DAG.getEntryNode()) + .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. + SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a52265055c88a..fa0c899dfcc27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8958,9 +8958,8 @@ bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { // Avoid emitting tail calls in functions with the disable-tail-calls // attribute. const Function *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && - !isMustTailCall) + if (!isMustTailCall && + Caller->getFnAttribute("disable-tail-calls").getValueAsBool()) return false; // We can't tail call inside a function with a swifterror argument. 
Lowering diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 6c78ef05e1b61..7496c5a084da4 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -704,7 +704,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8aff45f..cd39970f5111f 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 77af29b9d70f6..2ce5719228a0d 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringTable.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/xxhash.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -72,3 +74,80 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { return false; } } + +std::pair<FunctionType *, AttributeList> +RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT, + const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const { + static constexpr Attribute::AttrKind CommonFnAttrs[] = { + Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync, + Attribute::NoUnwind, Attribute::WillReturn}; + + switch (LibcallImpl) { + case RTLIB::impl___sincos_stret: + case RTLIB::impl___sincosf_stret: { + if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected + return {}; + + Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret + ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + + AttrBuilder FuncAttrBuilder(Ctx); + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + + const bool UseSret = + TT.isX86_32() || ((TT.isARM() || TT.isThumb()) && + ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS); + + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly( + UseSret ? 
ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + if (UseSret) { + AttrBuilder AttrBuilder(Ctx); + StructType *StructTy = StructType::get(ScalarTy, ScalarTy); + AttrBuilder.addStructRetAttr(StructTy); + AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy)); + FunctionType *FuncTy = FunctionType::get( + Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false); + + return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)}; + } + + Type *RetTy = + LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64() + ? static_cast<Type *>(FixedVectorType::get(ScalarTy, 2)) + : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy)); + + return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs}; + } + case RTLIB::impl_sqrtf: + case RTLIB::impl_sqrt: { + AttrBuilder FuncAttrBuilder(Ctx); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false); + + Attrs = Attrs.addRetAttribute( + Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal | + fcNegNormal)); + return {FuncTy, Attrs}; + } + default: + return {}; + } + + return {}; +} diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp index 5e3713778286f..d693ea33d8d7b 100644 --- a/llvm/lib/MC/SPIRVObjectWriter.cpp +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" @@ -17,8 +18,10 @@ using namespace llvm; void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { constexpr uint32_t MagicNumber = 0x07230203; constexpr uint32_t GeneratorID = 43; - constexpr uint32_t GeneratorMagicNumber = - (GeneratorID << 16) | (LLVM_VERSION_MAJOR); + const uint32_t GeneratorMagicNumber = + Asm.getContext().getTargetTriple().getVendor() == Triple::AMD + ? UINT16_MAX + : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR)); constexpr uint32_t Schema = 0; W.write<uint32_t>(MagicNumber); diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index e09dc947c2779..c2f4560c06c0d 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } -ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset, +ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset, uint64_t Size) const { return arrayRefFromStringRef(getData().substr(Offset, Size)); } Expected<ArrayRef<uint8_t>> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { - uint32_t Offset; + uint64_t Offset; uint64_t Size; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Offset = Sect.offset; Size = Sect.size; + // Check for large mach-o files where the section contents might exceed + // 4GB. MachO::section_64 objects only have 32 bit file offsets to the + // section contents and can overflow in dSYM files. 
We can track this and + // adjust the section offset to be 64 bit safe. If sections overflow then + // section ordering is enforced. If sections are not ordered, then an error + // will be returned stopping invalid section data from being returned. + uint64_t PrevTrueOffset = 0; + uint64_t SectOffsetAdjust = 0; + for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) { + MachO::section_64 CurrSect = + getStruct<MachO::section_64>(*this, Sections[SectIdx]); + uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust; + if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset)) + return malformedError("section data exceeds 4GB and section file " + "offsets are not ordered"); + const uint64_t EndSectFileOffset = + (uint64_t)CurrSect.offset + CurrSect.size; + if (EndSectFileOffset > UINT32_MAX) + SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull; + PrevTrueOffset = CurrTrueOffset; + } + Offset += SectOffsetAdjust; } else { MachO::section Sect = getSection(Sec); Offset = Sect.offset; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac090721c..3f41618b18fcf 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication( static cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableHotColdSplit("hot-cold-split", diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index 1914f4cc39d96..d859abddbcad8 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -231,7 +231,7 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, } // Compute move gains - typedef std::pair<float, BPFunctionNode *> GainPair; + using GainPair = std::pair<float, BPFunctionNode *>; std::vector<GainPair> Gains; for (auto &N : Nodes) { bool FromLeftToRight = (N.Bucket == LeftBucket); diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index de5bd795403dc..dab8beeff7ca5 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -2343,10 +2343,10 @@ namespace { class HelpPrinter { protected: const bool ShowHidden; - typedef SmallVector<std::pair<const char *, Option *>, 128> - StrOptionPairVector; - typedef SmallVector<std::pair<const char *, SubCommand *>, 128> - StrSubCommandPairVector; + using StrOptionPairVector = + SmallVector<std::pair<const char *, Option *>, 128>; + using StrSubCommandPairVector = + SmallVector<std::pair<const char *, SubCommand *>, 128>; // Print the options. Opts is assumed to be alphabetically sorted. 
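Several of the hunks here and below only swap typedef for a using alias; the two declare the same type, but using reads left-to-right and also extends to template aliases. A small self-contained illustration of the difference:

#include <map>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

typedef std::vector<std::pair<std::string, int>> OldStyleVec;
using NewStyleVec = std::vector<std::pair<std::string, int>>;

// Alias templates have no typedef spelling at all.
template <typename T> using StringKeyMap = std::map<std::string, T>;

static_assert(std::is_same_v<OldStyleVec, NewStyleVec>, "same type");

int main() {
  StringKeyMap<int> M{{"a", 1}};
  return M.at("a") - 1;
}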
virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) { for (const auto &Opt : Opts) diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index 981536473d124..3bfae147d18c0 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -47,16 +47,16 @@ class DAGDeltaAlgorithmImpl { friend class DeltaActiveSetHelper; public: - typedef DAGDeltaAlgorithm::change_ty change_ty; - typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; - typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; - typedef DAGDeltaAlgorithm::edge_ty edge_ty; + using change_ty = DAGDeltaAlgorithm::change_ty; + using changeset_ty = DAGDeltaAlgorithm::changeset_ty; + using changesetlist_ty = DAGDeltaAlgorithm::changesetlist_ty; + using edge_ty = DAGDeltaAlgorithm::edge_ty; private: - typedef std::vector<change_ty>::iterator pred_iterator_ty; - typedef std::vector<change_ty>::iterator succ_iterator_ty; - typedef std::set<change_ty>::iterator pred_closure_iterator_ty; - typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + using pred_iterator_ty = std::vector<change_ty>::iterator; + using succ_iterator_ty = std::vector<change_ty>::iterator; + using pred_closure_iterator_ty = std::set<change_ty>::iterator; + using succ_closure_iterator_ty = std::set<change_ty>::iterator; DAGDeltaAlgorithm &DDA; diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index f1c15c00cedea..61566d3722419 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -23,7 +23,7 @@ using namespace llvm::sys; // All methods for HandleSet should be used holding SymbolsMutex. class DynamicLibrary::HandleSet { - typedef std::vector<void *> HandleList; + using HandleList = std::vector<void *>; HandleList Handles; void *Process = &Invalid; diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 708e79d39cd21..6c140be59fc4b 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -34,6 +34,31 @@ static bool isContextFalsey(const json::Value *V) { return isFalsey(*V); } +static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) { + size_t CurrentPos = 0; + while (CurrentPos < Str.size()) { + // Find the next delimiter. + size_t DelimiterPos = Str.find('.', CurrentPos); + + // If no delimiter is found, process the rest of the string. + if (DelimiterPos == StringRef::npos) + DelimiterPos = Str.size(); + + // Get the current part, which may have whitespace. + StringRef Part = Str.slice(CurrentPos, DelimiterPos); + + // Manually trim the part without creating a new string object. + size_t Start = Part.find_first_not_of(" \t\r\n"); + if (Start != StringRef::npos) { + size_t End = Part.find_last_not_of(" \t\r\n"); + Tokens.push_back(Part.slice(Start, End + 1)); + } + + // Move past the delimiter for the next iteration. + CurrentPos = DelimiterPos + 1; + } +} + static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // We split the mustache string into an accessor. // For example: @@ -46,13 +71,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // It's a literal, so it doesn't need to be saved. Tokens.push_back("."); } else { - while (!Str.empty()) { - StringRef Part; - std::tie(Part, Str) = Str.split('.'); - // Each part of the accessor needs to be saved to the arena - // to ensure it has a stable address. 
- Tokens.push_back(Ctx.Saver.save(Part.trim())); - } + splitAndTrim(Str, Tokens); } // Now, allocate memory for the array of StringRefs in the arena. StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); @@ -368,141 +387,99 @@ struct Tag { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first - << ", NewClose: " << Ret.second << "\n"); - return Ret; -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. 
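splitAndTrim above walks a dotted accessor such as " a.b . c " and yields the trimmed parts, skipping parts that end up empty. A rough standalone equivalent over std::string_view (illustrative only; the real helper works on StringRef and saves the parts into the arena at the call site):

#include <cassert>
#include <string_view>
#include <vector>

// Split Str on '.' and trim ASCII whitespace from each part, dropping parts
// that are empty after trimming.
static std::vector<std::string_view> splitAndTrim(std::string_view Str) {
  std::vector<std::string_view> Parts;
  size_t Pos = 0;
  while (Pos < Str.size()) {
    size_t Dot = Str.find('.', Pos);
    if (Dot == std::string_view::npos)
      Dot = Str.size();
    std::string_view Part = Str.substr(Pos, Dot - Pos);
    size_t Begin = Part.find_first_not_of(" \t\r\n");
    if (Begin != std::string_view::npos) {
      size_t End = Part.find_last_not_of(" \t\r\n");
      Parts.push_back(Part.substr(Begin, End - Begin + 1));
    }
    Pos = Dot + 1;
  }
  return Parts;
}

int main() {
  auto P = splitAndTrim(" a.b . c ");
  assert(P.size() == 3 && P[0] == "a" && P[1] == "b" && P[2] == "c");
  return 0;
}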
static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open - << "', Close:'" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. + ++Cursor; + continue; } - // Add the text before the tag. - if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Found a tag, first add the preceding text. + if (Cursor > TextStart) + Tokens.emplace_back(Template.slice(TextStart, Cursor)); + + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. + Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. - Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. 
- // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) + Tokens.emplace_back(Template.substr(TextStart)); + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. - // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); @@ -622,9 +599,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) + RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 9d45096dddd97..b08f5083e00a8 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -207,7 +207,7 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { namespace { -typedef StringMap<Timer> Name2TimerMap; +using Name2TimerMap = StringMap<Timer>; class Name2PairMap { StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map; diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index afce803f3f568..8ad20b45f5e16 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -46,12 +46,11 @@ using namespace llvm; // Context //===----------------------------------------------------------------------===// -namespace llvm::detail { /// This class represents the internal implementation of the RecordKeeper. /// It contains all of the contextual static state of the Record classes. It is /// kept out-of-line to simplify dependencies, and also make it easier for /// internal classes to access the uniquer state of the keeper. -struct RecordKeeperImpl { +struct detail::RecordKeeperImpl { RecordKeeperImpl(RecordKeeper &RK) : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK), SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK), @@ -99,7 +98,6 @@ struct RecordKeeperImpl { void dumpAllocationStats(raw_ostream &OS) const; }; -} // namespace llvm::detail void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const { // Dump memory allocation related stats. 
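The parseSection change above sizes the raw body once, reserves, and then appends, which avoids repeated reallocation while concatenating the token bodies. The same pattern with std::string, as a sketch of the idea rather than the parser itself:

#include <cassert>
#include <string>
#include <vector>

static std::string joinBodies(const std::vector<std::string> &Bodies) {
  size_t Total = 0;
  for (const std::string &B : Bodies)
    Total += B.size();

  std::string Raw;
  Raw.reserve(Total); // one allocation instead of one per append
  for (const std::string &B : Bodies)
    Raw += B;
  return Raw;
}

int main() {
  assert(joinBodies({"{{#s}}", "text", "{{/s}}"}) == "{{#s}}text{{/s}}");
  return 0;
}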
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 1169f26a2ae37..97298f9d74171 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -655,16 +655,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit); IRBuilder<> B(BB); - // Load the global symbol as a pointer to the check function. - Value *GuardFn; - if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf")) - GuardFn = GuardFnCFGlobal; - else - GuardFn = GuardFnGlobal; - LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFn); - - // Create new call instruction. The CFGuard check should always be a call, - // even if the original CallBase is an Invoke or CallBr instruction. + // Create new call instruction. The call check should always be a call, + // even if the original CallBase is an Invoke or CallBr instructio. + // This is treated as a direct call, so do not use GuardFnCFGlobal. + LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFnGlobal); Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes()); CallInst *GuardCheck = B.CreateCall( GuardFnType, GuardCheckLoad, {F, Thunk}); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 1b5a713bffdc9..34c85d588f9c4 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError def CSR_Win_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>; +def CSR_Win_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +def CSR_Win_AArch64_RT_AllRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>; + // The Control Flow Guard check call uses a custom calling convention that also // preserves X0-X8 and Q0-Q7. def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cf344980cbaae..18e246e5af57d 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -81,10 +81,7 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - using BaseKind = enum { - RegBase, - FrameIndexBase - }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 0f7b34c36055f..3ee4d58ca892c 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2380,13 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot( return; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) { - LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " - "sized objects or realignment\n"); - return; - } - // If another calling convention is explicitly set FPRs can't be promoted to // ZPR callee-saves. 
if (!is_contained({CallingConv::C, CallingConv::Fast, @@ -2402,6 +2395,7 @@ void AArch64FrameLowering::determineStackHazardSlot( assert(Subtarget.isSVEorStreamingSVEAvailable() && "Expected SVE to be available for PPRs"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); // With SplitSVEObjects the CS hazard padding is placed between the // PPRs and ZPRs. If there are any FPR CS there would be a hazard between // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..d08f9b94227a2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. @@ -5346,35 +5340,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -7723,8 +7688,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed29f252a..70bfae717fb76 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e69fa32967a79..2ab7bf19da410 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (MOP.isReg() && MOP.isKill()) DefinedInBB.addReg(MOP.getReg()); + // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but + // only copies implicit defs and makes sure that each operand is only added + // once in case of duplicates. + auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1, + MachineBasicBlock::iterator MI2) { + SmallSetVector<Register, 4> Ops; + for (const MachineOperand &MO : + llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (const MachineOperand &MO : + llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (auto Op : Ops) + MIB.addDef(Op, RegState::Implicit); + }; + CopyImplicitOps(I, Paired); + // Erase the old instructions. 
I->eraseFromParent(); Paired->eraseFromParent();
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 45b7120112af2..4df4d54e60c95 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -805,7 +805,7 @@ void AArch64PrologueEmitter::emitPrologue() { CFAOffset += SVEAllocs.BeforePPRs; assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs, + allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs); CFAOffset += SVEAllocs.AfterPPRs; @@ -1318,6 +1318,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF, SEHEpilogueStartI = MBB.end(); } +void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI, + StackOffset Offset) { + // Other combinations could be supported, but are not currently needed. + assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 && + "expected negative offset (with optional fixed portion)"); + Register Base = AArch64::FP; + if (int64_t FixedOffset = Offset.getFixed()) { + // If we have a negative fixed offset, we need to first subtract it in a + // temporary register (to avoid briefly deallocating the scalable + // portion of the offset). + Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP, + StackOffset::getFixed(FixedOffset), TII, + MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base, + StackOffset::getScalable(Offset.getScalable()), TII, + MachineInstr::FrameDestroy); +} + void AArch64EpilogueEmitter::emitEpilogue() { MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); if (MBB.end() != EpilogueEndI) { @@ -1418,6 +1438,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { AfterCSRPopSize += ProloguePopSize; } } + // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1483,7 +1504,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); - MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; // Deallocate the SVE area. if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -1510,28 +1530,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP : AArch64::SP; if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { - // TODO: Support stack realigment and variable-sized objects. - assert( - SVELayout != SVEStackLayout::Split && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS).
- CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); + if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the ZPR saves. + StackOffset FPOffsetZPR = + -SVECalleeSavesSize - PPR.LocalsSize - + StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset()); + // Deallocate the stack space by moving the SP to the start of the + // ZPR/PPR callee-save area. + moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR); + } + // With split SVE, the predicates are stored in a separate area above the + // ZPR saves, so we must adjust the stack to the start of the PPRs. + if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the PPR saves. + StackOffset FPOffsetPPR = -PPR.CalleeSavesSize; + // Move to the start of the PPR area. + assert(!FPOffsetPPR.getFixed() && "expected only scalable offset"); + emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP, + FPOffsetPPR, TII, MachineInstr::FrameDestroy); } - // The code below will deallocate the stack space space by moving the SP - // to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); } else if (BaseForSVEDealloc == AArch64::SP) { auto NonSVELocals = StackOffset::getFixed(NumBytes); auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) +
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index 6e0e28324a0ac..7f297b5d337b0 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -180,6 +180,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon { private: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + /// A helper for moving the SP to a negative offset from the FP, without + /// deallocating any stack in the range FP to FP + Offset. + void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset); + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5bfb19d9a7e61..a5048b9c9e61d 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) return getDarwinCalleeSavedRegs(MF); + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? CSR_Win_AArch64_RT_MostRegs_SaveList + : CSR_AArch64_RT_MostRegs_SaveList; + + if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ?
CSR_Win_AArch64_RT_AllRegs_SaveList + : CSR_AArch64_RT_AllRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) return CSR_Win_AArch64_CFGuard_Check_SaveList; if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) { @@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail) return CSR_AArch64_AAPCS_SwiftTail_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) - return CSR_AArch64_RT_MostRegs_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) - return CSR_AArch64_RT_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) // This is for OSes other than Windows; Windows is a separate case further // above.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 5b5565afd62b1..197aae6e03cb1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3007,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { llvm_unreachable("Unsupported register kind"); } -bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy) const { +bool AArch64TTIImpl::isSingleExtWideningInstruction( + unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { // A helper that returns a vector type from the given type. The number of // elements in type Ty determines the vector width. auto toVectorTy = [&](Type *ArgTy) { @@ -3027,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) return false; - // Determine if the operation has a widening variant. We consider both the - // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the - // instructions. - // - // TODO: Add additional widening operations (e.g., shl, etc.) once we - // verify that their extending operands are eliminated during code - // generation. Type *SrcTy = SrcOverrideTy; switch (Opcode) { - case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). - case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Add: // UADDW(2), SADDW(2). + case Instruction::Sub: { // USUBW(2), SSUBW(2). // The second operand needs to be an extend if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType()); - } else + break; + } + + if (Opcode == Instruction::Sub) return false; - break; - case Instruction::Mul: { // SMULL(2), UMULL(2) - // Both operands need to be extends of the same type. - if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || - (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + + // UADDW(2), SADDW(2) can be commuted. + if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType()); - } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) { - // If one of the operands is a Zext and the other has enough zero bits to - // be treated as unsigned, we can still general a umull, meaning the zext - // is free. - KnownBits Known = - computeKnownBits(isa<ZExtInst>(Args[0]) ?
Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3099,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext. + unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? 
Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3159,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required, the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). @@ -3168,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. + } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4148,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4381,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28. 
+ if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + @@ -6129,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } static bool containsDecreasingPointers(Loop *TheLoop, - PredicatedScalarEvolution *PSE) { + PredicatedScalarEvolution *PSE, + const DominatorTree &DT) { const auto &Strides = DenseMap<Value *, const SCEV *>(); for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for addresses that are @@ -6138,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop, if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, - /*ShouldCheckWrap=*/false) + if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) .value_or(0) < 0) return true; } @@ -6184,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { // negative strides. This will require extra work to reverse the loop // predicate, which may be expensive. if (containsDecreasingPointers(TFI->LVL->getLoop(), - TFI->LVL->getPredicatedScalarEvolution())) + TFI->LVL->getPredicatedScalarEvolution(), + *TFI->LVL->getDominatorTree())) Required |= TailFoldingOpts::Reverse; if (Required == TailFoldingOpts::Disabled) Required |= TailFoldingOpts::Simple;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index b39546a9a381d..e3b0a1bec53ec 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'.
// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 8669978637f40..56ab040706a13 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -38,9 +38,10 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - NOT_IMPLICIT_INPUT = 0, + UNKNOWN_INTRINSIC = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, + NOT_IMPLICIT_INPUT }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return NOT_IMPLICIT_INPUT; + return UNKNOWN_INTRINSIC; } } @@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); + + if (AttrMask == UNKNOWN_INTRINSIC) { + // Assume not-nocallback intrinsics may invoke a function which accesses + // implicit arguments. + // + // FIXME: This isn't really the correct check. We want to ensure it + // isn't calling any function that may use implicit arguments regardless + // of whether it's internal to the module or not. + // + // TODO: Ignoring callsite attributes. + if (!Callee->hasFnAttribute(Attribute::NoCallback)) + return indicatePessimisticFixpoint(); + continue; + } + if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - return true; + + // Assume !nocallback intrinsics may call a function which requires + // AGPRs. 
+ return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 0c977416f1793..15ed60b46a9c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -602,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); + I.getOperand(0).setIsEarlyClobber(true); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -3787,6 +3794,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } @@ -6753,7 +6764,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6806,8 +6817,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index dd474ac52c3c8..1e5885a25c195 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -913,6 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) .Uni(S64, {{Sgpr64}, {}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d520fb23..fe81a5efd9d51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock 
*AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. + BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. 
- BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. - BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca5a955c..1a14629fb66b3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce6904ce4d..1682abbdea169 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ class GCNNSAReassignImpl { bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9485e4d..62172a0bb89db 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); // Immediates are not an issue and can be propagated in // postrapseudos pass. 
Only handle cases where defining diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c920a046..31eca049fd149 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? - SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a3d6daa..aa5ea77f17291 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 45f591927b86e..9460145d47111 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7945,7 +7945,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7985,7 +7985,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -8183,7 +8183,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -9199,7 +9199,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21f959ce..0643b532ea04c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -172,7 +172,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e111ec862..8785968569d92 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: @@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4ae2c1ed04dae..31d8bce4d0c87 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. 
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 14e1160e70dae..88d3b6f7d5bb9 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -86,7 +86,7 @@ namespace { // All possible address modes, plus some. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..92fae71121a81 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. 
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. - SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b0d8803..bc2fec3c1bdb5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9b250e6cac3ab..24f58a68c345d 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { // static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, - const LoopAccessInfo *LAI) { + const LoopAccessInfo *LAI, + const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); // If there are live-out values, it is probably a reduction. 
We can predicate @@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); + int64_t NextStride = + getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0); if (NextStride == 1) { // TODO: for now only allow consecutive strides of 1. We could support // other strides as long as it is uniform, but let's keep it simple @@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } - return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); + return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), + *LVL->getDominatorTree()); } TailFoldingStyle diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 77dc4a75a7d68..abe081c0c76fd 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -88,6 +88,16 @@ bool BPFAsmPrinter::doFinalization(Module &M) { } } + for (GlobalObject &GO : M.global_objects()) { + if (!GO.hasExternalWeakLinkage()) + continue; + + if (!SawTrapCall && GO.getName() == BPF_TRAP) { + GO.eraseFromParent(); + break; + } + } + return AsmPrinter::doFinalization(M); } @@ -160,6 +170,16 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + if (MI->isCall()) { + for (const MachineOperand &Op : MI->operands()) { + if (Op.isGlobal()) { + if (const GlobalValue *GV = Op.getGlobal()) + if (GV->getName() == BPF_TRAP) + SawTrapCall = true; + } + } + } + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); @@ -195,6 +215,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const Function &F = MF->getFunction(); + + MCSection *Sec = OutStreamer->getCurrentSectionOnly(); + MCSymbol *SecStart = Sec->getBeginSymbol(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); @@ -207,8 +231,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { MCSymbol *JTStart = getJTPublicSymbol(JTI); OutStreamer->emitLabel(JTStart); for (const MachineBasicBlock *MBB : JTBBs) { - const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - OutStreamer->emitValue(LHS, EntrySize); + const MCExpr *Diff = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB->getSymbol(), OutContext), + MCSymbolRefExpr::create(SecStart, OutContext), OutContext); + OutStreamer->emitValue(Diff, EntrySize); } const MCExpr *JTSize = MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 90ef2073609a6..75a1d7ed9f884 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -39,6 +39,7 @@ class BPFAsmPrinter : public AsmPrinter { private: BTFDebug *BTF; TargetMachine &TM; + bool SawTrapCall = false; const BPFTargetMachine &getBTM() const; }; diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 8ace2d2777c74..eb4c8846441a2 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -194,9 +194,10 @@ void 
DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { dxbc::PSV::v2::ResourceBindInfo BindInfo; BindInfo.Type = Type; BindInfo.LowerBound = Binding.LowerBound; - assert(Binding.Size == UINT32_MAX || - (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && - "Resource range is too large"); + assert( + (Binding.Size == UINT32_MAX || + (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) && + "Resource range is too large"); BindInfo.UpperBound = (Binding.Size == UINT32_MAX) ? UINT32_MAX : Binding.LowerBound + Binding.Size - 1; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 7ae500a55b92d..67437f6969b27 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { let attributes = [Attributes<DXIL1_0, []>]; } +def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> { + let Doc = "returns the float16 stored in the low-half of the uint converted " + "to a float"; + let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>]; + let arguments = [Int32Ty]; + let result = FloatTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def WaveAllBitCount : DXILOp<135, waveAllOp> { let Doc = "returns the count of bits set to 1 across the wave"; let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>]; diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 60dfd9650937c..6cacbf6564db2 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const { switch (ID) { case Intrinsic::dx_asdouble: - case Intrinsic::dx_isinf: - case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: - case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: + case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: return OpdIdx == 0; default: return OpdIdx == -1; @@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_frac: case Intrinsic::dx_isinf: case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index fe700e17d341b..cf4ffc82f6009 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6630,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index b502b056c4cdf..00d52870f1727 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2113,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>; defm : 
subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>; defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index df0c8c13fa38d..06210b6b91b93 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel { // All possible address modes. 
class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c667a09f95dbb..996d653940118 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1836,7 +1836,7 @@ bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { return true; } -NVPTXScopes::NVPTXScopes(LLVMContext &C) { +NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) { Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread; Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System; Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block; @@ -1851,11 +1851,21 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { auto S = Scopes.find(ID); if (S == Scopes.end()) { - // TODO: - // - Add API to LLVMContext to get the name of a single scope. - // - Use that API here to print an error containing the name - // of this Unknown ID. - report_fatal_error(formatv("Could not find scope ID={}.", int(ID))); + auto ScopeName = Context->getSyncScopeName(ID); + assert(ScopeName.has_value() && "Scope name must exist."); + + // Build the list of supported syncscopes programmatically. + SmallVector<StringRef> SupportedScopes; + for (const auto &Entry : Scopes) { + if (auto Name = Context->getSyncScopeName(Entry.first)) + SupportedScopes.push_back(Name->empty() ? "<empty string>" : *Name); + } + + reportFatalUsageError( + formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n" + "Supported syncscopes are: {2}.", + ScopeName.value(), int(ID), + make_range(SupportedScopes.begin(), SupportedScopes.end()))); } return S->second; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 1cb579bd96730..d525531766ddf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -35,6 +35,7 @@ struct NVPTXScopes { private: SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{}; + LLVMContext *Context = nullptr; }; class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b26022184708c..f0bdf472b96ed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2267,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>; def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index da3efdc15f1e1..0c2e44e18f463 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -360,6 +360,10 @@ let Predicates = [HasVSX, IsISAFuture] in { def LXVPRLL : XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp), (ins (memr $RA):$addr, g8rc:$RB), "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def LXVPB32X + : XForm_XTp5_RAB5<31, 877, (outs
vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { @@ -376,6 +380,10 @@ let Predicates = [HasVSX, IsISAFuture] in { : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPB32X + : XForm_XTp5_RAB5<31, 1005, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index b38dd4ae948c6..fc3cde3f464bb 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, RegConstraint<"@earlyclobber $AT">; def PM#NAME#WPP : MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), @@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 282cf5d681685..3d5a55c631301 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -95,7 +95,8 @@ class RISCVInstructionSelector : public InstructionSelector { void addVectorLoadStoreOperands(MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, bool IsMasked, - bool IsStrided) const; + bool IsStridedOrIndexed, + LLT *IndexVT = nullptr) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -722,15 +723,17 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { void RISCVInstructionSelector::addVectorLoadStoreOperands( MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, - bool IsMasked, bool IsStrided) const { + bool IsMasked, bool IsStridedOrIndexed, LLT *IndexVT) const { // Base Pointer auto PtrReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(PtrReg); - // Stride - if (IsStrided) { + // Stride or Index + if (IsStridedOrIndexed) { auto StrideReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(StrideReg); + if (IndexVT) + *IndexVT = MRI->getType(StrideReg); } // Mask @@ -805,6 +808,70 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vluxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vloxei_mask || + IntrinID == Intrinsic::riscv_vluxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vloxei || + IntrinID == Intrinsic::riscv_vloxei_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = 
Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + // Use NoRegister if there is no specified passthru. + SrcOps.push_back(Register()); + } + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } case Intrinsic::riscv_vsm: case Intrinsic::riscv_vse: case Intrinsic::riscv_vse_mask: @@ -847,6 +914,56 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vsuxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vsoxei_mask || + IntrinID == Intrinsic::riscv_vsuxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vsoxei || + IntrinID == Intrinsic::riscv_vsoxei_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. 
+ + // Store value + auto StoreValReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(StoreValReg); + + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index e75dfe33814c6..5b8cfb2100b26 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -407,7 +407,6 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, - OPERAND_SIMM8, OPERAND_SIMM8_UNSIGNED, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 526675a682d86..b0453fc57c053 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -131,6 +131,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCMAXU: case RISCV::PseudoCCMIN: case RISCV::PseudoCCMINU: + case RISCV::PseudoCCMUL: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -237,6 +238,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cfee6ab22d4ff..5b72334f58d45 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1856,6 +1856,11 @@ def TuneShortForwardBranchIMinMax "true", "Enable short forward branch optimization for min,max instructions in Zbb", [TuneShortForwardBranchOpt]>; +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for the mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure.
def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index b25a05400fe31..907833513c5d1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -371,8 +371,8 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -444,8 +444,8 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2223,8 +2223,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2457,8 +2457,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( IsMasked, IsOrdered, IndexLog2EEW, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e0cf739f67d9b..c3f100e3197b1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9186,7 +9186,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, unsigned ShAmount = Log2_64(TrueM1); if (Subtarget.hasShlAdd(ShAmount)) return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, - DAG.getConstant(ShAmount, DL, VT), CondV); + DAG.getTargetConstant(ShAmount, DL, VT), CondV); } } // (select c, y, 0) -> -c & y @@ -15463,7 +15463,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? 
N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, - DAG.getConstant(Diff, DL, VT), NS); + DAG.getTargetConstant(Diff, DL, VT), NS); return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT)); } @@ -15501,7 +15501,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, int64_t AddConst = AddVal.getSExtValue(); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), - DAG.getConstant(ShlConst, DL, VT), Other); + DAG.getTargetConstant(ShlConst, DL, VT), Other); return DAG.getNode(ISD::ADD, DL, VT, SHADD, DAG.getSignedConstant(AddConst, DL, VT)); } @@ -16495,6 +16495,35 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getTargetConstant(ShX, DL, VT), Mul359); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt) { + switch (MulAmt) { + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3); + default: + return SDValue(); + } +} + // Try to expand a scalar multiply to a faster sequence. static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,18 +16553,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. - SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { + SDValue X = N->getOperand(0); int Shift; if (int ShXAmount = isShifted359(MulAmt, Shift)) { // 3/5/9 * 2^N -> shl (shXadd X, X), N SDLoc DL(N); - SDValue X = N->getOperand(0); // Put the shift first if we can fold a zext into the shift forming // a slli.uw. if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -16543,49 +16571,19 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(ShXAmount, DL, VT), Shl); + DAG.getTargetConstant(ShXAmount, DL, VT), Shl); } // Otherwise, put the shl second so that it can fold with following // instructions (e.g. sext or add). 
SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(ISD::SHL, DL, VT, Mul359, DAG.getConstant(Shift, DL, VT)); } // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); - } + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt)) + return V; // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. First check if this a sum of two power of 2s because that's @@ -16598,7 +16596,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); + DAG.getTargetConstant(ScaleShift, DL, VT), Shift1); } } @@ -16611,10 +16609,11 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, assert(Shift != 0 && "MulAmt=4,6,10 handled before"); if (Shift <= 3) { SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT), X); + DAG.getTargetConstant(Shift, DL, VT), X); } } @@ -16626,9 +16625,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ScaleShift, DL, VT), X)); } } @@ -16643,28 +16643,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X); return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. 
+ Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { + SDLoc DL(N); + return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); } } @@ -25320,3 +25309,12 @@ ArrayRef<MCPhysReg> RISCVTargetLowering::getRoundingControlRegisters() const { } return {}; } + +bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; + + return VT.getSizeInBits() <= Subtarget.getXLen(); +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 9e3e2a9443625..dd62a9cf6c9e2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -465,6 +465,8 @@ class RISCVTargetLowering : public TargetLowering { ArrayRef<MCPhysReg> getRoundingControlRegisters() const override; + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; + /// Match a mask which "spreads" the leading elements of a vector evenly /// across the result. Factor is the spread amount, and Index is the /// offset applied. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index c9df787e0012d..b8ab70bd9e386 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1703,6 +1703,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::MAXU: return RISCV::PseudoCCMAXU; case RISCV::MIN: return RISCV::PseudoCCMIN; case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1754,6 +1755,9 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) return nullptr; + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. 
if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 5a67a5aaba293..494b1c9f98839 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -110,6 +110,7 @@ def PseudoCCMAX : SFBALU_rr; def PseudoCCMIN : SFBALU_rr; def PseudoCCMAXU : SFBALU_rr; def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index b37ceaaee9cf4..c2b25c6294019 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -60,6 +60,8 @@ def immfour : RISCVOp { let DecoderMethod = "decodeImmFourOperand"; } +def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction, let Predicates = [HasVendorXTHeadBa] in { def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)), (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; -def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)), - (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; +def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)), + (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>; // Reuse complex patterns from StdExtZba def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 4537bfe8025ca..8376da52be53e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -53,6 +53,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, let OperandType = "OPERAND_UIMM5_GT3"; } +def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>; + def UImm5Plus1AsmOperand : AsmOperandClass { let Name = "UImm5Plus1"; let RenderMethod = "addImmOperands"; @@ -1419,8 +1421,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; -def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)), - (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)), + (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] /// Simple arithmetic operations diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index c31713e967b18..1c6a5afcda49b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,6 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { +let canFoldAsLoad = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, 
offset rather than diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 640b014646f36..0175f2fb3698b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::SubgroupSize, 0, 0); + if (MDNode *Node = F.getMetadata("max_work_group_size")) { + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes)) + outputExecutionModeFromMDNode( + FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1); + } if (MDNode *Node = F.getMetadata("vec_type_hint")) { MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 9e11c3a281a1b..dd57b74d79a5e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -149,23 +149,23 @@ static FunctionType *getOriginalFunctionType(const Function &F) { return isa<MDString>(N->getOperand(0)) && cast<MDString>(N->getOperand(0))->getString() == F.getName(); }); - // TODO: probably one function can have numerous type mutations, - // so we should support this. if (ThisFuncMDIt != NamedMD->op_end()) { auto *ThisFuncMD = *ThisFuncMDIt; - MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); - assert(MD && "MDNode operand is expected"); - ConstantInt *Const = getConstInt(MD, 0); - if (Const) { - auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); - assert(CMeta && "ConstantAsMetadata operand is expected"); - assert(Const->getSExtValue() >= -1); - // Currently -1 indicates return value, greater values mean - // argument numbers. - if (Const->getSExtValue() == -1) - RetTy = CMeta->getType(); - else - ArgTypes[Const->getSExtValue()] = CMeta->getType(); + for (unsigned I = 1; I != ThisFuncMD->getNumOperands(); ++I) { + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(I)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. 
+ if (Const->getSExtValue() == -1) + RetTy = CMeta->getType(); + else + ArgTypes[Const->getSExtValue()] = CMeta->getType(); + } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 96f5dee21bc2a..43b2869cecdf7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -107,6 +107,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_inline_assembly}, {"SPV_INTEL_bindless_images", SPIRV::Extension::Extension::SPV_INTEL_bindless_images}, + {"SPV_INTEL_bfloat16_arithmetic", + SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic}, {"SPV_INTEL_bfloat16_conversion", SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion}, {"SPV_KHR_subgroup_rotate", @@ -155,7 +157,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_INTEL_predicated_io", SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, {"SPV_KHR_maximal_reconvergence", - SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, + {"SPV_INTEL_kernel_attributes", + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3f0424f436c72..245e5a2894604 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3516,6 +3516,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_nonuniformindex: { return selectResourceNonUniformIndex(ResVReg, ResType, I); } + case Intrinsic::spv_unpackhalf2x16: { + return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16); + } + default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index db036a55ee6c6..e5ac76c405841 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1435,6 +1435,8 @@ void addInstrRequirements(const MachineInstr &MI, addPrintfRequirements(MI, Reqs, ST); break; } + // TODO: handle bfloat16 extended instructions when + // SPV_INTEL_bfloat16_arithmetic is enabled. 
break; } case SPIRV::OpAliasDomainDeclINTEL: @@ -2060,7 +2062,64 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL); break; } - + case SPIRV::OpFAddS: + case SPIRV::OpFSubS: + case SPIRV::OpFMulS: + case SPIRV::OpFDivS: + case SPIRV::OpFRemS: + case SPIRV::OpFMod: + case SPIRV::OpFNegate: + case SPIRV::OpFAddV: + case SPIRV::OpFSubV: + case SPIRV::OpFMulV: + case SPIRV::OpFDivV: + case SPIRV::OpFRemV: + case SPIRV::OpFNegateV: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Arithmetic instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg()); + SPIRVType *TypeDef = MRI.getVRegDef(OperandDef->getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Relational instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } default: break; } @@ -2180,6 +2239,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::SubgroupSize, ST); + if (F.getMetadata("max_work_group_size")) + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST); if (F.getMetadata("vec_type_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index ba09692fec515..ad6c9cd421b7c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, SPIRVVersion = VersionTuple(1, 3); break; case Triple::SPIRVSubArch_v14: - default: SPIRVVersion = VersionTuple(1, 4); break; case Triple::SPIRVSubArch_v15: @@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, case Triple::SPIRVSubArch_v16: SPIRVVersion = VersionTuple(1, 6); break; + default: + if (TT.getVendor() == 
Triple::AMD) + SPIRVVersion = VersionTuple(1, 6); + else + SPIRVVersion = VersionTuple(1, 4); } OpenCLVersion = VersionTuple(2, 2); // Set the environment based on the target triple. if (TargetTriple.getOS() == Triple::Vulkan) Env = Shader; - else if (TargetTriple.getEnvironment() == Triple::OpenCL) + else if (TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getVendor() == Triple::AMD) Env = Kernel; else Env = Unknown; @@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, // Set the default extensions based on the target triple. if (TargetTriple.getVendor() == Triple::Intel) Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers); + if (TargetTriple.getVendor() == Triple::AMD) + Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple); // The order of initialization is important. initAvailableExtensions(Extensions); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 7d08b29a51a6e..1b4b29bbb160a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -387,6 +387,8 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; +defm SPV_INTEL_bfloat16_arithmetic + : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -570,6 +572,7 @@ defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atom defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; +defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; @@ -587,6 +590,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0, defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>; defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>; defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>; +defm KernelAttributesINTEL : CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// TODO-SPIRV: add these once they are used / tested. 
+// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// END TODO-SPIRV defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>; defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>; defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; @@ -805,6 +813,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; +defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>; +// TODO-SPIRV: Add the following once they are used / tested. +// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>; +// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>; +// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>; +// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>; +// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>; +// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>; +// END TODO-SPIRV defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; @@ -1919,7 +1936,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; -// Arithmetic +// Arithmetic defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; defm Not : SpecConstantOpOperandsOperand<200, [], []>; defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 5ba035682238b..2951a4bc695e2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -244,7 +244,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI( cl::Optional, cl::init(false)); void SPIRVPassConfig::addPreEmitPass() { - if (SPVEnableNonSemanticDI) { + if (SPVEnableNonSemanticDI || + getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) { addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>())); } } diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt index 1e83cbeac50d6..17df119d62709 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel) tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM WebAssemblyGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git 
a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2666342d0c7b9..66ed8b078b808 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 37a34573bb339..9fef3e6d8b089 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -24,6 +24,7 @@ #include "WebAssembly.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -114,6 +115,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper->setAttributes(F->getAttributes()); BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); const DataLayout &DL = BB->getDataLayout(); + IRBuilder<> Builder(BB); // Determine what arguments to pass. SmallVector<Value *, 4> Args; @@ -140,10 +142,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Args.push_back(&*AI); } else { if (CastInst::isBitOrNoopPointerCastable(ArgType, ParamType, DL)) { - Instruction *PtrCast = - CastInst::CreateBitOrPointerCast(AI, ParamType, "cast"); - PtrCast->insertInto(BB, BB->end()); - Args.push_back(PtrCast); + Args.push_back(Builder.CreateBitOrPointerCast(AI, ParamType, "cast")); } else if (ArgType->isStructTy() || ParamType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: " << F->getName() << "\n"); @@ -166,24 +165,19 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { for (; AI != AE; ++AI) Args.push_back(&*AI); - CallInst *Call = CallInst::Create(F, Args, "", BB); + CallInst *Call = Builder.CreateCall(F, Args); - Type *ExpectedRtnType = F->getFunctionType()->getReturnType(); - Type *RtnType = Ty->getReturnType(); // Determine what value to return. 
if (RtnType->isVoidTy()) { - ReturnInst::Create(M->getContext(), BB); + Builder.CreateRetVoid(); } else if (ExpectedRtnType->isVoidTy()) { LLVM_DEBUG(dbgs() << "Creating dummy return: " << *RtnType << "\n"); - ReturnInst::Create(M->getContext(), PoisonValue::get(RtnType), BB); + Builder.CreateRet(PoisonValue::get(RtnType)); } else if (RtnType == ExpectedRtnType) { - ReturnInst::Create(M->getContext(), Call, BB); + Builder.CreateRet(Call); } else if (CastInst::isBitOrNoopPointerCastable(ExpectedRtnType, RtnType, DL)) { - Instruction *Cast = - CastInst::CreateBitOrPointerCast(Call, RtnType, "cast"); - Cast->insertInto(BB, BB->end()); - ReturnInst::Create(M->getContext(), Cast, BB); + Builder.CreateRet(Builder.CreateBitOrPointerCast(Call, RtnType, "cast")); } else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: " << F->getName() << "\n"); @@ -203,9 +197,8 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper = Function::Create(Ty, Function::PrivateLinkage, F->getName() + "_bitcast_invalid", M); Wrapper->setAttributes(F->getAttributes()); - BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); - new UnreachableInst(M->getContext(), BB); - Wrapper->setName(F->getName() + "_bitcast_invalid"); + IRBuilder<> Builder(BasicBlock::Create(M->getContext(), "body", Wrapper)); + Builder.CreateUnreachable(); } else if (!WrapperNeeded) { LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName() << "\n"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def deleted file mode 100644 index 23108e429eda8..0000000000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ /dev/null @@ -1,64 +0,0 @@ -//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file describes the various WebAssembly ISD node types. -/// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! - -HANDLE_NODETYPE(CALL) -HANDLE_NODETYPE(RET_CALL) -HANDLE_NODETYPE(RETURN) -HANDLE_NODETYPE(ARGUMENT) -HANDLE_NODETYPE(LOCAL_GET) -HANDLE_NODETYPE(LOCAL_SET) -// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol -HANDLE_NODETYPE(Wrapper) -// A special node for TargetGlobalAddress used in PIC code for -// __memory_base/__table_base relative access. 
-HANDLE_NODETYPE(WrapperREL) -HANDLE_NODETYPE(BR_IF) -HANDLE_NODETYPE(BR_TABLE) -HANDLE_NODETYPE(DOT) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) -HANDLE_NODETYPE(SHUFFLE) -HANDLE_NODETYPE(SWIZZLE) -HANDLE_NODETYPE(VEC_SHL) -HANDLE_NODETYPE(VEC_SHR_S) -HANDLE_NODETYPE(VEC_SHR_U) -HANDLE_NODETYPE(NARROW_U) -HANDLE_NODETYPE(EXTEND_LOW_S) -HANDLE_NODETYPE(EXTEND_LOW_U) -HANDLE_NODETYPE(EXTEND_HIGH_S) -HANDLE_NODETYPE(EXTEND_HIGH_U) -HANDLE_NODETYPE(CONVERT_LOW_S) -HANDLE_NODETYPE(CONVERT_LOW_U) -HANDLE_NODETYPE(PROMOTE_LOW) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_S) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_U) -HANDLE_NODETYPE(DEMOTE_ZERO) -HANDLE_NODETYPE(I64_ADD128) -HANDLE_NODETYPE(I64_SUB128) -HANDLE_NODETYPE(I64_MUL_WIDE_S) -HANDLE_NODETYPE(I64_MUL_WIDE_U) - -// Memory intrinsics -HANDLE_NODETYPE(GLOBAL_GET) -HANDLE_NODETYPE(GLOBAL_SET) -HANDLE_NODETYPE(TABLE_GET) -HANDLE_NODETYPE(TABLE_SET) - -// Bulk memory instructions. These follow LLVM's expected semantics of -// supporting out-of-bounds pointers if the length is zero, by inserting -// a branch around Wasm's `memory.copy` and `memory.fill`, which would -// otherwise trap. -HANDLE_NODETYPE(MEMCPY) -HANDLE_NODETYPE(MEMSET) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7ec463bdc3b84..af322982d5355 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -942,20 +942,6 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter( } } -const char * -WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - case WebAssemblyISD::FIRST_NUMBER: - break; -#define HANDLE_NODETYPE(NODE) \ - case WebAssemblyISD::NODE: \ - return "WebAssemblyISD::" #NODE; -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE - } - return nullptr; -} - std::pair<unsigned, const TargetRegisterClass *> WebAssemblyTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { @@ -1830,11 +1816,8 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op, SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32); EVT LocalVT = LN->getValueType(0); - SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT, - {LN->getChain(), Idx}); - SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL); - assert(Result->getNumValues() == 2 && "Loads must carry a chain!"); - return Result; + return DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, {LocalVT, MVT::Other}, + {LN->getChain(), Idx}); } if (WebAssembly::isWasmVarAddressSpace(LN->getAddressSpace())) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 472ec678534a4..f7052989b3c75 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -19,17 +19,6 @@ namespace llvm { -namespace WebAssemblyISD { - -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, -#define HANDLE_NODETYPE(NODE) NODE, -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE -}; - -} // end namespace WebAssemblyISD - class WebAssemblySubtarget; class WebAssemblyTargetLowering final : public TargetLowering { @@ -53,7 +42,6 @@ class WebAssemblyTargetLowering final : public TargetLowering { MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, 
MachineBasicBlock *MBB) const override; - const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index 2673c81eae40b..cf5cc41ea565b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -11,23 +11,31 @@ /// //===----------------------------------------------------------------------===// +#include "WebAssemblySelectionDAGInfo.h" #include "WebAssemblyTargetMachine.h" + +#define GET_SDNODE_DESC +#include "WebAssemblyGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" +WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo() + : SelectionDAGGenTargetInfo(WebAssemblyGenSDNodeInfo) {} + WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor -bool WebAssemblySelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { +const char * +WebAssemblySelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - default: - return false; - case WebAssemblyISD::GLOBAL_GET: - case WebAssemblyISD::GLOBAL_SET: - case WebAssemblyISD::TABLE_GET: - case WebAssemblyISD::TABLE_SET: - return true; + case WebAssemblyISD::CALL: + return "WebAssemblyISD::CALL"; + case WebAssemblyISD::RET_CALL: + return "WebAssemblyISD::RET_CALL"; } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy( diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 69c9af0966308..8775f4946d88d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -17,13 +17,26 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "WebAssemblyGenSDNodeInfo.inc" + namespace llvm { +namespace WebAssemblyISD { + +enum NodeType : unsigned { + CALL = GENERATED_OPCODE_END, + RET_CALL, +}; -class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo { +} // namespace WebAssemblyISD + +class WebAssemblySelectionDAGInfo final : public SelectionDAGGenTargetInfo { public: + WebAssemblySelectionDAGInfo(); + ~WebAssemblySelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a7a51d0..fa23656e23fc3 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -158,7 +158,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. 
-FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. FunctionPass *createX86WinEHUnwindV2Pass(); @@ -179,7 +188,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. -FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, @@ -220,7 +240,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index c0c7f5adf06ef..ddbd10d8f7eda 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, const MachineOperand &Src2 = MI.getOperand(2); bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND; const MCInstrDesc &NewDesc = - ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r); + ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r); if (Is32BitReg) Src1 = getX86SubSuperRegister(Src1, 64); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 133406bd8e0d7..4d44227b3ecd4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. 
+ // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. + MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. + SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { @@ -33004,60 +33061,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). 
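Returning to the v64i8 variable-shift lowering introduced above: the even/odd split can be understood on a single 16-bit word holding two 8-bit lanes. A minimal scalar sketch of the SHL case (my own illustration of the idea, not the DAG code; SRL/SRA differ only in which half is masked or sign-extended):

```cpp
// Scalar model of the even/odd-lane trick: shift two packed 8-bit lanes held
// in one 16-bit word, each by its own amount, using only 16-bit shifts, then
// merge the even lane of one result with the odd lane of the other.
#include <cassert>
#include <cstdint>

uint16_t shl_two_bytes(uint16_t R, unsigned AmtLo, unsigned AmtHi) {
  uint16_t RLo = R;          // even lane: overflow into the high byte is fine,
                             // only the low byte of this result is kept.
  uint16_t RHi = R & 0xFF00; // odd lane: mask so even-lane bits cannot shift
                             // up into the odd lane's result.
  uint16_t ShiftedLo = uint16_t(RLo << AmtLo);
  uint16_t ShiftedHi = uint16_t(RHi << AmtHi);
  // Masked merge, the scalar analogue of the v64i1 select mask 0x5555... .
  return uint16_t((ShiftedHi & 0xFF00) | (ShiftedLo & 0x00FF));
}

int main() {
  uint16_t Got = shl_two_bytes(0x8342, /*AmtLo=*/3, /*AmtHi=*/1);
  assert(Got == 0x0610); // 0x42 << 3 -> 0x10 (even), 0x83 << 1 -> 0x06 (odd)
  return 0;
}
```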
For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33662,7 +33665,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); @@ -53347,6 +53349,112 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Look for a RMW operation that only touches one bit of a larger than legal +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. +static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); + + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || + VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) + return SDValue(); + + // BTR: X & ~(1 << ShAmt) + // BTS: X | (1 << ShAmt) + // BTC: X ^ (1 << ShAmt) + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue SrcVal, InsertBit, ShAmt; + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || + sd_match(StoredVal, + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(SrcVal); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) + return SDValue(); + + // Ensure the shift amount is in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) + return SDValue(); + + // If we're inserting a bit then it must be the LSB. 
+ if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + + // Split the shift into an alignment shift that moves the active i32 block to + // the bottom bits for truncation and a modulo shift that can act on the i32. + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); + + // Compute the byte offset for the i32 block that is changed by the RMW. + // combineTruncate will adjust the load for us in a similar way. + EVT PtrVT = St->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT); + SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL, + SDNodeFlags::NoUnsignedWrap); + + // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); + + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } + + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. + if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; +} + static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53573,6 +53681,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } + if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + return R; + // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC) if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54505,8 +54616,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, // truncation, see if we can convert the shift into a pointer offset instead. // Limit this to normal (non-ext) scalar integer loads. 
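To make the narrowBitOpRMW fold above concrete: a single-bit set (the BTS shape) on a wider-than-legal integer in memory only needs to read, modify and write the aligned 32-bit word containing the bit. A scalar sketch of the same offset and mask arithmetic, assuming a little-endian byte layout; this is an illustration, not the combine itself:

```cpp
// Set bit Idx of a 128-bit value stored in memory by rewriting only the
// aligned 32-bit word that holds it. AlignAmt = Idx & -32 selects the word,
// ModuloAmt = Idx & 31 is the bit position inside it.
#include <cassert>
#include <cstdint>
#include <cstring>

void bts128(unsigned char *Mem, unsigned Idx) {
  unsigned AlignAmt  = Idx & ~31u;       // bit offset of the containing i32
  unsigned ModuloAmt = Idx & 31u;        // bit within that i32
  unsigned ByteOfs   = AlignAmt >> 3;    // the pointer offset the combine forms
  uint32_t Word;
  std::memcpy(&Word, Mem + ByteOfs, 4);  // narrow load
  Word |= 1u << ModuloAmt;               // BTS on the i32 (OR with the mask)
  std::memcpy(Mem + ByteOfs, &Word, 4);  // narrow store
}

int main() {
  unsigned char Mem[16] = {};
  bts128(Mem, 70);                       // bit 70 = byte 8, bit 6 (little endian)
  assert(Mem[8] == (1u << 6));
  return 0;
}
```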
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && - Src.hasOneUse() && Src.getOperand(0).hasOneUse() && - ISD::isNormalLoad(Src.getOperand(0).getNode())) { + Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && + (Src.getOperand(0).hasOneUse() || + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) { auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); if (Ld->isSimple() && VT.isByteSized() && isPowerOf2_64(VT.getSizeInBits())) { @@ -54529,8 +54641,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewLoad = DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), Align(), Ld->getMemOperand()->getFlags()); - DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), - NewLoad.getValue(1)); + DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; } } @@ -56306,6 +56417,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -56364,6 +56476,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); + // If we're performing a bit test on a larger than legal type, attempt + // to (aligned) shift down the value to the bottom 32-bits and then + // perform the bittest on the i32 value. + // ICMP_ZERO(AND(X,SHL(1,IDX))) + // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) + if (isNullConstant(RHS) && + OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) { + SDValue X, ShAmt; + if (sd_match(LHS, m_OneUse(m_And(m_Value(X), + m_Shl(m_One(), m_Value(ShAmt)))))) { + // Only attempt this if the shift amount is known to be in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getConstant(31, DL, AmtVT)); + SDValue Mask = DAG.getNode( + ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); + return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), + CC); + } + } + } + // cmpeq(trunc(x),C) --> cmpeq(x,C) // cmpne(trunc(x),C) --> cmpne(x,C) // iff x upper bits are zero. 
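The setcc combine above leans on the same index split: testing bit IDX of a wide integer is equivalent to testing bit (IDX & 31) of the 32-bit word selected by (IDX & -32). A quick self-check of that identity for a 128-bit value, assuming a compiler with __int128 support:

```cpp
// Checks ICMP_ZERO(AND(X, SHL(1, Idx))) against
// ICMP_ZERO(AND(TRUNC(SRL(X, Idx & -32)), SHL(1, Idx & 31))) for every Idx.
#include <cassert>
#include <cstdint>

int main() {
  unsigned __int128 X = ((unsigned __int128)0x0123456789abcdefULL << 64) |
                        0xfedcba9876543210ULL;
  for (unsigned Idx = 0; Idx < 128; ++Idx) {
    bool Wide = ((X >> Idx) & 1) != 0;               // bit test on the i128
    uint32_t Word = (uint32_t)(X >> (Idx & ~31u));   // TRUNC(SRL(X, Idx & -32))
    bool Narrow = (Word & (1u << (Idx & 31u))) != 0; // bit test on the i32
    assert(Wide == Narrow);
  }
  return 0;
}
```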
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f3393910da2c..662aec2c15241 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() { return C; } +namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + +bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: @@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? 
&LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index a25e4e0f464a4..898c83cf9b468 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -16,10 +16,12 @@ #include "X86TargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -30,39 +32,44 @@ using namespace llvm; namespace { -class X86PartialReduction : public FunctionPass { +class X86PartialReduction { + const X86TargetMachine *TM; const DataLayout *DL = nullptr; const X86Subtarget *ST = nullptr; +public: + X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {} + bool run(Function &F); + +private: + bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); + bool trySADReplacement(Instruction *Op); +}; + +class X86PartialReductionLegacy : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - X86PartialReduction() : FunctionPass(ID) { } + X86PartialReductionLegacy() : FunctionPass(ID) {} - bool runOnFunction(Function &Fn) override; + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - StringRef getPassName() const override { - return "X86 Partial Reduction"; - } - -private: - bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); - bool trySADReplacement(Instruction *Op); + StringRef getPassName() const override { return "X86 Partial Reduction"; } }; } -FunctionPass *llvm::createX86PartialReductionPass() { - return new X86PartialReduction(); +FunctionPass *llvm::createX86PartialReductionLegacyPass() { + return new X86PartialReductionLegacy(); } -char X86PartialReduction::ID = 0; +char X86PartialReductionLegacy::ID = 0; -INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, - "X86 Partial Reduction", false, false) +INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction", + false, false) // This function should be aligned with detectExtMul() in X86ISelLowering.cpp. 
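The X86 pass changes above follow the usual porting recipe: one implementation class holds the transform, and both the legacy FunctionPass and the new-PM pass are thin drivers around it. A stripped-down sketch of that shape with hypothetical names and no LLVM headers, just to show the division of responsibility:

```cpp
// "Shared impl, two drivers" sketch, mirroring how X86PartialReduction and
// X86LowerAMXIntrinsics are split above. All types here are stand-ins.
struct Function { const char *Name; };

// The transform itself, usable from either pass manager.
class PartialReductionImpl {
public:
  bool run(Function &) { return false; } // returns whether anything changed
};

// Legacy pass driver: just forwards.
class PartialReductionLegacy {
public:
  bool runOnFunction(Function &F) { return PartialReductionImpl().run(F); }
};

// New-PM driver: forwards and reports what it preserved.
enum class Preserved { All, CFGOnly };
class PartialReductionPass {
public:
  Preserved run(Function &F) {
    return PartialReductionImpl().run(F) ? Preserved::CFGOnly : Preserved::All;
  }
};

int main() {
  Function F{"foo"};
  PartialReductionLegacy().runOnFunction(F);
  (void)PartialReductionPass().run(F);
  return 0;
}
```

The payoff of this split is that the gating logic (such as shouldRunLowerAMXIntrinsics above) lives in exactly one place, so the two entry points cannot drift apart.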
static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul, @@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { } } -bool X86PartialReduction::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - auto &TM = TPC->getTM<X86TargetMachine>(); - ST = TM.getSubtargetImpl(F); - +bool X86PartialReduction::run(Function &F) { + ST = TM->getSubtargetImpl(F); DL = &F.getDataLayout(); bool MadeChange = false; @@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) { return MadeChange; } + +bool X86PartialReductionLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F); +} + +PreservedAnalyses X86PartialReductionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + bool Changed = X86PartialReduction(TM).run(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index fc25d55d3059a..db255940f8829 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -15,14 +15,14 @@ #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) +FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) -DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a76abcd351bf..5f0bcab251e61 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -97,7 +97,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86LoadValueInjectionLoadHardeningPassPass(PR); initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); - initializeX86PartialReductionPass(PR); + initializeX86PartialReductionLegacyPass(PR); initializePseudoProbeInserterPass(PR); initializeX86ReturnThunksPass(PR); initializeX86DAGToDAGISelLegacyPass(PR); @@ -422,14 +422,14 @@ void X86PassConfig::addIRPasses() { // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. 
- addPass(createX86LowerAMXIntrinsicsPass()); + addPass(createX86LowerAMXIntrinsicsLegacyPass()); addPass(createX86LowerAMXTypeLegacyPass()); TargetPassConfig::addIRPasses(); if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createInterleavedAccessPass()); - addPass(createX86PartialReductionPass()); + addPass(createX86PartialReductionLegacyPass()); } // Add passes that handle indirect branch removal and insertion of a retpoline diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp index f6f7e92d98578..2f28ab36aa193 100644 --- a/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -66,7 +66,7 @@ namespace { MachineBasicBlock &MBB); void addDirtySuccessor(MachineBasicBlock &MBB); - using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; + enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; static const char* getBlockExitStateName(BlockExitState ST); diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5ed47aec08b25..a6ac7610a2c7a 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? 
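For the ptrmask alignment handling around this point: a mask constant with K trailing zero bits forces at least 2^K alignment on the result, and clearing low bits can never break the alignment the source pointer already had, which is why the larger of the two is taken. A small numeric check of both facts (illustration only):

```cpp
// ptrmask(p, m) == p & m: the result is aligned to 1 << ctz(m), and it also
// keeps whatever alignment p already had, so the best known value is the max.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t P = 0x1008;            // an 8-byte aligned address
  uint64_t Mask = ~uint64_t(63);  // 6 trailing zeros -> forces 64-byte alignment
  uint64_t Masked = P & Mask;     // what llvm.ptrmask computes
  assert(Masked % 64 == 0);       // alignment implied by the mask
  assert(Masked % 8 == 0);        // original alignment of P is preserved
  return 0;
}
```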
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA && AlignAA->isValidState()) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da818746..9572f9d702e1b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return 
NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c7ed968..b158e0f626850 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec633a57..ceeece41782f4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). 
- Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. 
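A scalar model of the shadow rule quoted above for the integer (ZeroPurifies) case: one product element is poisoned only when an operand is poisoned and the other operand is not a known, initialized zero. Booleans stand in for the per-element shadow here; this is an illustration of the formula, not the instrumentation:

```cpp
// Per-element shadow for x*y when zero purifies:
//   Out = (Sx & Sy) | (Vx!=0 & Sy) | (Sx & Vy!=0)
// Sx/Sy: "operand element is poisoned"; Vx/Vy: the concrete values.
#include <cassert>

bool mulShadow(bool Sx, int Vx, bool Sy, int Vy) {
  bool VxNZ = Vx != 0, VyNZ = Vy != 0;
  return (Sx && Sy) || (VxNZ && Sy) || (Sx && VyNZ);
}

int main() {
  assert(!mulShadow(false, 3, false, 4)); // clean * clean          -> clean
  assert(!mulShadow(false, 0, true, 7));  // initialized 0 * poison -> clean
  assert(mulShadow(false, 2, true, 7));   // nonzero * poison       -> poison
  assert(mulShadow(true, 1, true, 1));    // poison * poison        -> poison
  return 0;
}
```

This is exactly why the comment restricts zero-purification to integer multiplies: for floating point, 0.0 * NaN is still NaN, so an uninitialized operand can leak through a zero.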
// (The real pmadd intrinsic would have computed intermediate values of @@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: case Intrinsic::x86_avx512_pmaddubs_w_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true); break; // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // AVX Vector Neural Network Instructions: bytes @@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx2_vpdpbuuds_128: case Intrinsic::x86_avx2_vpdpbuuds_256: case Intrinsic::x86_avx10_vpdpbuuds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // AVX Vector Neural Network Instructions: words @@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 19eccb9e17020..9ffa602416b05 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1796,14 +1796,16 @@ struct LoopFuser { // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { @@ -2092,14 +2094,16 @@ struct LoopFuser { // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. 
It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a8839981e5478..1b770be3909a9 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ class LoadEliminationForLoop { // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ class LoadEliminationForLoop { // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. 
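For the isDependenceDistanceOfOne checks now threading a DominatorTree above, the pattern being legalized is the classic distance-one forward: the store of iteration i is exactly the load of iteration i+1. A source-level picture of a candidate and of what load elimination effectively produces (illustrative only):

```cpp
#include <cstddef>

// Candidate: the load A[I] reads the value stored by the previous iteration.
void candidate(double *A, std::size_t N) {
  for (std::size_t I = 0; I + 1 < N; ++I)
    A[I + 1] = A[I] * 2.0;
}

// Roughly the effect of forwarding: one startup load, then carry in a register.
void forwarded(double *A, std::size_t N) {
  if (N < 2)
    return;
  double Carried = A[0];
  for (std::size_t I = 0; I + 1 < N; ++I) {
    double Stored = Carried * 2.0;
    A[I + 1] = Stored;
    Carried = Stored; // value forwarded to the next iteration's "load"
  }
}

int main() { return 0; }
```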
- if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d83236e8..802ae4e9c28e3 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index bb6c879f4d47e..0577ddbd2353c 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -337,7 +338,7 @@ static void buildPartialUnswitchConditionalBranch( static void buildPartialInvariantUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) { ValueToValueMapTy VMap; for (auto *Val : reverse(ToDuplicate)) { Instruction *Inst = cast<Instruction>(Val); @@ -377,8 +378,19 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + // The expectation is that ToDuplicate[0] is the condition used by the + // OriginalBranch, case in which we can clone the profile metadata from there. + auto *ProfData = + !ProfcheckDisableMetadataFixes && + ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition()) + ? OriginalBranch.getMetadata(LLVMContext::MD_prof) + : nullptr; + auto *BR = + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, ProfData); + if (!ProfData) + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), + DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2515,7 +2527,7 @@ static void unswitchNontrivialInvariants( // the branch in the split block. 
if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, @@ -2820,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -3186,10 +3203,15 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled( + *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) + : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 5f6f66a4bc213..0a8f5ea2fdae1 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding direct target blocks are for now ignored by + // this pass. This is not a limitation for the currently intended uses cases + // of callbr in the AMDGPU backend. + // Parent and child regions are not affected by this (current) restriction. 
+ // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details. + if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f5fae8..608661583c3db 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51cd2c21..6d4436b92c119 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { if (Impl == RTLIB::Unsupported) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. 
+ if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f29030ddb05..a03cf6e953e35 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, // Create integer constant expression. auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * { const APInt &API = cast<ConstantInt>(&CV)->getValue(); - std::optional<int64_t> InitIntOpt = API.trySExtValue(); + std::optional<int64_t> InitIntOpt; + if (API.getBitWidth() == 1) + InitIntOpt = API.tryZExtValue(); + else + InitIntOpt = API.trySExtValue(); return InitIntOpt ? DIB.createConstantValueExpression( static_cast<uint64_t>(*InitIntOpt)) : nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 1e8f6cc76900c..6c9467bf4a005 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// probability of executing at least one more iteration? static BranchProbability probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. 
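The getExpressionForConstant change above works around a genuine sign-extension pitfall: a 1-bit APInt holding `true` sign-extends to -1, so the emitted debug-info constant for an `i1` would be wrong, while zero-extension gives the expected 1. The effect can be reproduced with plain integer arithmetic (a sketch of the behaviour, not the LLVM APInt code):

```cpp
// Why the 1-bit special case matters: sign-extending i1 "true" yields -1,
// zero-extending yields 1, which is the value the DIExpression should carry.
#include <cassert>
#include <cstdint>

int64_t sextFromWidth(uint64_t V, unsigned Bits) { // models trySExtValue()
  return int64_t(V << (64 - Bits)) >> (64 - Bits);
}
uint64_t zextFromWidth(uint64_t V, unsigned Bits) { // models tryZExtValue()
  return Bits == 64 ? V : (V & ((1ULL << Bits) - 1));
}

int main() {
  assert(sextFromWidth(1, 1) == -1); // i1 true, sign-extended: wrong constant
  assert(zextFromWidth(1, 1) == 1);  // i1 true, zero-extended: expected value
  assert(sextFromWidth(1, 8) == 1);  // wider integers are unaffected
  return 0;
}
```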
BranchProbability ProbOne = OriginalLoopProb; // 1 <= m diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8be471bee5579..6e60b94be78e3 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -992,9 +992,12 @@ BranchProbability llvm::getBranchProbability(BranchInst *B, uint64_t Weight0, Weight1; if (!extractBranchWeights(*B, Weight0, Weight1)) return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); if (!ForFirstTarget) std::swap(Weight0, Weight1); - return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); } bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e87cf1a..3a3e3ade20212 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -778,8 +778,10 @@ struct ConstantComparesGatherer { return false; // Add all values from the range to the set - for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) + APInt Tmp = Span.getLower(); + do Vals.push_back(ConstantInt::get(I->getContext(), Tmp)); + while (++Tmp != Span.getUpper()); UsedICmps++; return true; @@ -6020,6 +6022,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6039,15 +6043,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6058,33 +6065,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. 
- // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. - if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } @@ -7570,6 +7585,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. 
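A source-level view of what simplifySwitchWhenUMin targets: when the switch key is clamped with an unsigned min and the default is unreachable, every case above the clamp constant is dead, and the clamp constant's case can absorb the default so the clamp itself disappears. Illustration only, assuming the default truly cannot be reached (__builtin_unreachable stands in for the unreachable default):

```cpp
#include <algorithm>

// Before: the key is umin(A, 3), so case 4 is dead and, once cases 0..3
// exist, the default can never be taken.
int before(unsigned A) {
  switch (std::min(A, 3u)) {
  case 0: return 10;
  case 1: return 11;
  case 2: return 12;
  case 3: return 13;
  case 4: return 14;            // dead: umin(A, 3) is never 4
  default: __builtin_unreachable();
  }
}

// After the transform: switch directly on A; the old case 3 becomes the
// default and now covers every value >= 3.
int after(unsigned A) {
  switch (A) {
  case 0: return 10;
  case 1: return 11;
  case 2: return 12;
  default: return 13;           // was case 3
  }
}

int main() { return before(5) == after(5) ? 0 : 1; }
```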
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -8037,6 +8127,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c1709f43e..e86ab13094b15 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; + bool Changed = false; for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool UpdatedLI = false; BasicBlock *NewSucc = SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // SplitCallBrEdge modifies the CFG because it creates an intermediate + // block. So we need to set the changed flag no matter what the + // ControlFlowHub is going to do later. + Changed = true; // Even if CallBr and Succ do not have a common parent loop, we need to // add the new target block to the parent loop of the current loop. 
if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); + ChangedCFG |= Changed; if (!ChangedCFG) return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16132093..03112c67dda7b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 34b405ced8c0a..bf3f52c51b64c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, if (isa<PHINode>(S.getMainOp()) || isVectorLikeInstWithConstOps(S.getMainOp())) return nullptr; + // If the parent node is non-schedulable and the current node is copyable, and + // any of parent instructions are used outside several basic blocks or in + // bin-op node - cancel scheduling, it may cause wrong def-use deps in + // analysis, leading to a crash. + // Non-scheduled nodes may not have related ScheduleData model, which may lead + // to a skipped dep analysis. 
+ if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->doesNotNeedToSchedule() && + EI.UserTE->getOpcode() != Instruction::PHI && + any_of(EI.UserTE->Scalars, [](Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I || I->hasOneUser()) + return false; + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (isa<BinaryOperator>(UI)) + return true; + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e1da070a1fb7f..cfe1f1e9d7528 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1110,9 +1110,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, VP_CLASSOF_IMPL(VPDef::VPInstructionSC) VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); - auto *New = - new VPInstruction(Opcode, Operands, *this, *this, getDebugLoc(), Name); + auto *New = new VPInstruction(Opcode, operands(), *this, *this, + getDebugLoc(), Name); if (getUnderlyingValue()) New->setUnderlyingValue(getUnderlyingInstr()); return New; @@ -1226,10 +1225,9 @@ class VPInstructionWithType : public VPInstruction { } VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); auto *New = - new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this, - getDebugLoc(), getName()); + new VPInstructionWithType(getOpcode(), operands(), getResultType(), + *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9d9bb14530539..2588c878d8472 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -154,27 +154,32 @@ static bool sinkScalarOperands(VPlan &Plan) { bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; - auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo, - VPSingleDefRecipe *Candidate) { - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) - return false; + return; if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || Candidate->mayReadOrWriteMemory()) - return false; + return; if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) if (!ScalarVFOnly && RepR->isSingleScalar()) - return false; + return; - return true; + WorkList.insert({SinkTo, Candidate}); }; // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. 
- SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) @@ -182,14 +187,9 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) { - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(VPBB, Def)) - WorkList.insert({VPBB, Def}); - } - } + for (auto &Recipe : *VPBB) + for (VPValue *Op : Recipe.operands()) + InsertIfValidSinkCandidate(VPBB, Op); } // Try to sink each replicate or scalar IV steps recipe in the worklist. @@ -198,8 +198,8 @@ static bool sinkScalarOperands(VPlan &Plan) { VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must have only their first lane used. In + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In // the latter case, we need to duplicate SinkCandidate. auto UsersOutsideSinkTo = make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { @@ -234,10 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(SinkTo, Def)) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a10cb2c8..34754a1ea3992 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00da11dc8..27a8bbd5776be 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check wether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze. 
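The comment above describes when the freeze can be skipped; the sketch below is a hand-written illustration of that shape of input. It is not taken from the patch or its tests, and whether the freeze is actually elided for it depends on programUndefinedIfPoison and the surrounding checks: both lanes of the extended vector are extracted in the same block, and each extracted value feeds an address computation whose load would make the program undefined if the value were poison.

define i32 @all_lanes_used(<2 x i8> %v, ptr %base) {
entry:
  %ext = zext <2 x i8> %v to <2 x i32>
  %e0 = extractelement <2 x i32> %ext, i64 0
  %e1 = extractelement <2 x i32> %ext, i64 1
  ; Poison in either lane would reach a load address, which is undefined
  ; behavior, so a freeze of the scalarized source would add nothing here.
  %p0 = getelementptr inbounds i32, ptr %base, i32 %e0
  %p1 = getelementptr inbounds i32, ptr %base, i32 %e1
  %l0 = load i32, ptr %p0, align 4
  %l1 = load i32, ptr %p1, align 4
  %sum = add i32 %l0, %l1
  ret i32 %sum
}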
+ SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll index 7e1588f427be4..76f73e43a2355 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll @@ -325,14 +325,14 @@ define void @extaddv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = add <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: 
%sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -434,24 +434,24 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = add <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; 
CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -464,14 +464,14 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -573,24 +573,24 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = add <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> 
%i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> 
%i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -603,14 +603,14 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1020,14 +1020,14 @@ define void @extsubv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = sub <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; 
CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1129,24 +1129,24 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = sub <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 
SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1159,14 +1159,14 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost 
Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1268,24 +1268,24 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = sub <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 
SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -1298,14 +1298,14 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 
CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1715,14 +1715,14 @@ define void @extmulv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = mul <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 
CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1824,24 +1824,24 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = mul <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> 
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1854,14 +1854,14 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1963,24 +1963,24 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found 
costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = mul <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: 
%azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -1993,14 +1993,14 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 32408acb582d0..48537f6012dd5 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -1,6 +1,7 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos" ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB +; RUN: opt < %s -mtriple=arm64-apple-macos10.9 -mattr=+neon -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefix=SINCOS_STRET %s define void @sincos() { ; CHECK-LABEL: 'sincos' @@ -8,13 +9,11 @@ define void @sincos() { ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) @@ -26,18 +25,32 @@ define void @sincos() { ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: 
%v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) +; +; SINCOS_STRET-LABEL: 'sincos' +; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) ; %f16 = call { half, half } @llvm.sincos.f16(half poison) %f32 = call { float, float } @llvm.sincos.f32(float poison) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll index 4c2a9c3f29f02..d90a97f1651e6 100644 --- 
a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll @@ -10,7 +10,7 @@ ; s0 += (1ULL << 62) + 1; ; s1 += (1ULL << 62) + 2; ; } -; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine +; We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine ; there are no dependences), as the pointers are not dereferenced in all loop iterations. define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block' @@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -63,9 +68,14 @@ define void @test_inbounds_gep_used_in_predicated_block_stored_value_operand(ptr ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -109,9 +119,14 @@ define void @test_inbounds_gep_used_in_predicated_block_non_memop_user(ptr %A, i ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index aef7810fe2c3b..107a98aebeeb8 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -521,6 +521,11 @@ define void @f_sanitize_alloc_token() sanitize_alloc_token { ret void; } +; CHECK: define void @f_no_create_undef_or_poison() #56 +define void @f_no_create_undef_or_poison() nocreateundeforpoison { + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -633,6 +638,7 @@ define void @dead_on_return(ptr dead_on_return %p) { ; CHECK: attributes #53 = { sanitize_realtime } ; CHECK: attributes #54 = { sanitize_realtime_blocking } ; CHECK: attributes #55 = { sanitize_alloc_token } +; CHECK: attributes #56 = { nocreateundeforpoison } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index cad5df0d9655e..68ab8902767b3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -430,12 +430,12 @@ declare i32 @foo() ; Test case distilled from 126.gcc. ; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. -define void @build_modify_expr() nounwind ssp { +define void @build_modify_expr(i32 %cond) nounwind ssp { ; CHECK-LABEL: build_modify_expr: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: ret entry: - switch i32 undef, label %sw.bb.i.i [ + switch i32 %cond, label %sw.bb.i.i [ i32 69, label %if.end85 i32 70, label %if.end85 i32 71, label %if.end85 diff --git a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll index bdbc99e2d98b0..75e7ac902274d 100644 --- a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll +++ b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll @@ -2,15 +2,58 @@ declare void @called() declare void @escaped() -define void @f(ptr %dst) { +define void @f(ptr %dst, ptr readonly %f) { call void @called() +; CHECK: bl "#called" store ptr @escaped, ptr %dst - ret void + call void %f() +; CHECK: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: str x8, [x20] +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall_cfg +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall_cfg] +; CHECK-NEXT: mov x11, +; CHECK-NEXT: blr x8 +; CHECK-NEXT: blr x11 + ret void } +; CHECK-LABEL: .def "#called$exit_thunk"; +; CHECK-NEXT: .scl 2; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .section .wowthk$aa,"xr",discard,"#called$exit_thunk" +; CHECK-NEXT: .globl "#called$exit_thunk" // -- Begin function #called$exit_thunk +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: "#called$exit_thunk": // @"#called$exit_thunk" +; CHECK-NEXT: .weak_anti_dep called +; CHECK-NEXT: called = "#called" +; CHECK-NEXT: .weak_anti_dep "#called" +; CHECK-NEXT: "#called" = "#called$exit_thunk" +; CHECK-NEXT: .seh_proc "#called$exit_thunk" +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x11, called +; CHECK-NEXT: add x11, x11, :lo12:called +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: blr x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + !llvm.module.flags = !{!0} -!0 = !{i32 2, !"cfguard", i32 1} +!0 = !{i32 2, !"cfguard", i32 2} ; CHECK-LABEL: .section .gfids$y,"dr" ; CHECK-NEXT: .symidx escaped +; CHECK-NEXT: .symidx $iexit_thunk$cdecl$v$v ; CHECK-NOT: .symidx diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir index f535e0fe8b387..bb7ffb47d8dfe 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -162,63 +162,54 @@ body: | RET_ReallyLR # CHECK-LABEL: name: test_allocate_split_sve_realigned -# CHECK: stackSize: 2080 +# CHECK: stackSize: 1056 # CHECK: bb.0.entry: # CHECK: liveins: $z0, $p0, $lr -# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) -# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) -# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg -# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $x9, -3, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930 # # CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 # CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg -# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0) -# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 -# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1) +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1) # -# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 -# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) -# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from 
%stack.5), (load (s64) from %stack.4) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 # CHECK-NEXT: RET_ReallyLR # ASM-LABEL: test_allocate_split_sve_realigned -# ASM: sub sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: str x29, [sp, #1024] -# ASM-NEXT: str x30, [sp, #1032] -# ASM-NEXT: add x29, sp, #1024 +# ASM: stp x29, x30, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: mov x29, sp # ASM-NEXT: .cfi_def_cfa w29, 16 # ASM-NEXT: .cfi_offset w30, -8 # ASM-NEXT: .cfi_offset w29, -16 # -# ASM: sub sp, x29, #1024 -# ASM-NEXT: .cfi_def_cfa wsp, 1040 -# ASM-NEXT: ldr x30, [sp, #1032] -# ASM-NEXT: ldr x29, [sp, #1024] -# ASM-NEXT: add sp, sp, #1040 +# ASM: mov sp, x29 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldp x29, x30, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w30 # ASM-NEXT: .cfi_restore w29 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO: DW_CFA_def_cfa: reg29 +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 # -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg30 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 diff --git a/llvm/test/CodeGen/AArch64/ldst-implicitop.mir b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir new file mode 100644 index 0000000000000..34e8cf282669c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-- -run-pass=aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s +# Check that we copy implicit operands. +--- +name: impdef_op1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op1 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... +--- +name: impdef_op2 +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op2 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q20, renamable $q5 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q20 = LDRQui renamable $lr, 3 :: (load (s128)) + renamable $q5 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... 
+--- +name: impdef_both +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5, implicit-def $q20_q21 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: $q2 = ORRv16i8 $q20, killed $q20 + ; CHECK-NEXT: $q3 = ORRv16i8 $q21, killed $q21 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q20_q21 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + $q2 = ORRv16i8 $q20, killed $q20 + $q3 = ORRv16i8 $q21, killed $q21 + RET_ReallyLR +... +--- +name: impdef_both_same +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both_same + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll index 7f0968c8eb339..f77ada4eae022 100644 --- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll +++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s +; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s declare void @standard_cc_func() declare preserve_mostcc void @preserve_mostcc_func() @@ -8,18 +9,26 @@ declare preserve_mostcc void @preserve_mostcc_func() define preserve_mostcc void @preserve_mostcc1() nounwind { entry: ;CHECK-LABEL: preserve_mostcc1 -;CHECK-NOT: stp -;CHECK-NOT: str -;CHECK: str x15 -;CHECK-NEXT: stp x14, x13, -;CHECK-NEXT: stp x12, x11, -;CHECK-NEXT: stp x10, x9, -;CHECK: bl _standard_cc_func +;CHECK-DARWIN-NOT: stp +;CHECK-DARWIN-NOT: str +;CHECK-DARWIN: str x15 +;CHECK-DARWIN-NEXT: stp x14, x13, +;CHECK-DARWIN-NEXT: stp x12, x11, +;CHECK-DARWIN-NEXT: stp x10, x9, +;CHECK-WIN: stp x15, x14 +;CHECK-WIN-NEXT: stp x13, x12, +;CHECK-WIN-NEXT: stp x11, x10, +;CHECK-WIN-NEXT: stp x9, x30 +;CHECK: bl {{_?}}standard_cc_func call void @standard_cc_func() -;CHECK: ldp x10, x9, -;CHECK-NEXT: ldp x12, x11, -;CHECK-NEXT: ldp x14, x13, -;CHECK-NEXT: ldr x15 +;CHECK-DARWIN: ldp x10, x9, +;CHECK-DARWIN-NEXT: ldp x12, x11, +;CHECK-DARWIN-NEXT: ldp x14, x13, +;CHECK-DARWIN-NEXT: ldr x15 +;CHECK-WIN: ldp x9, x30 +;CHECK-WIN-NEXT: ldp x11, x10, +;CHECK-WIN-NEXT: ldp x13, x12, +;CHECK-WIN-NEXT: ldp x15, x14, ret void } @@ -31,9 +40,10 @@ define preserve_mostcc void @preserve_mostcc2() nounwind { entry: ;CHECK-LABEL: preserve_mostcc2 ;CHECK-NOT: x14 -;CHECK: stp x29, x30, +;CHECK-DARWIN: stp x29, x30, +;CHECK-WIN: str x30 ;CHECK-NOT: x14 -;CHECK: bl _preserve_mostcc_func +;CHECK: bl {{_?}}preserve_mostcc_func call preserve_mostcc void @preserve_mostcc_func() ret void } diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll 
b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll index 71c6380177b3a..8a0ac6d4ace7a 100644 --- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll @@ -780,6 +780,7 @@ define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 { attributes #0 = { "target-features"="+sve" } ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { "target-features"="+sve" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll index c13dd33865c37..f65aec6665cec 100644 --- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -737,36 +737,23 @@ entry: } declare ptr @memset(ptr, i32, i32) -; FIXME: aarch64-split-sve-objects is currently not supported in this function -; as it requires stack reealignment (for the 32-byte aligned alloca). -; GPR CSRs -; <hazard padding> -; FPR CSRs -; <hazrd padding> -; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here! -; <realignment padding> -; -> sp define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: zpr_and_ppr_local_realignment: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #1040 -; CHECK-NEXT: sub x9, sp, #1040 -; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #2064 +; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: addvl x9, x9, #-2 -; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: sub x8, x29, #1024 -; CHECK-NEXT: str p0, [x8, #-1, mul vl] +; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str z0, [x8, #-2, mul vl] ; CHECK-NEXT: str x0, [sp] -; CHECK-NEXT: sub sp, x29, #1024 -; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> %zpr_local = alloca <vscale x 16 x i8> @@ -805,3 +792,316 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x store volatile i64 %gpr, ptr %gpr_local ret void } + +; Only PPR callee-saves + a VLA +; Expect: No hazard padding. Frame pointer (x29), p4-p6 callee saves allocated +; with `addvl #-1`, PPR saves restored using frame pointer `addvl sp, x29, #-1`. +define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + ret void +} + +; Only ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), z8-z10 callee saves allocated +; with `addvl #-3`. ZPR saves restored from `FP - 1024 + addvl #-3`. +define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) { +; CHECK-LABEL: only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1040] // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-3 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sub sp, x29, #1024 +; CHECK-NEXT: ldr x19, [sp, #1040] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) 
@llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + ret void +} + +; PPR+ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves allocated separately, with hazard padding of 1024 between the +; areas. ZPR callee saves restored by `FP - 1024 + addvl #-4`, PPR callee saves +; restored by `FP + addvl #-1`. +define aarch64_sve_vector_pcs void @zpr_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + ret void +} + +; Only PPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) callee-saves, with +; hazard padding after the PPR callee saves (1024) and after the FPR local area +; (1024) -- coeleased to 2048. Only PPRs restored by moving the SP to +; `FP + addvl #-1`. +define void @sve_locals_only_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-3, mul vl] +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; Only ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), ZPR (z8-z10) callee-saves, with +; hazard padding before the ZPR callee saves (1024) and after the ZPR local area +; (1024). Only ZPRs restored by moving the SP to `FP - 1024 + addvl #-4`. +define void @sve_locals_only_zpr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-5, mul vl] +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; PPR+ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves, with hazard padding before the ZPR callee saves (1024) and after +; the ZPR local area (1024). ZPRs restored by moving the SP to +; `FP - 1024 + addvl #-5`, PPRs restored by moving SP to `FP + addvl #-1`. +define void @sve_locals_zpr_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-6, mul vl] +; CHECK-NEXT: addvl sp, x8, #-5 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index bdee359487ce6..70874761b82ab 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -3512,14 +3512,13 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; ; CHECK64-LABEL: svecc_call_dynamic_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 ; CHECK64-NEXT: .cfi_offset w19, -8 ; CHECK64-NEXT: .cfi_offset w20, -16 @@ -3529,7 +3528,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3542,30 +3541,32 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; 
CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: mov w2, w1 @@ -3595,22 +3596,31 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: 
ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -3623,21 +3633,12 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w20 
@@ -3649,305 +3650,444 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; 
CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: mov w8, w0 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov w8, w8 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: mov x20, x0 -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x1fffffff0 -; CHECK1024-NEXT: sub x8, x9, x8 -; CHECK1024-NEXT: mov sp, x8 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w20, #0, .LBB35_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB35_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w20, #0, .LBB35_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB35_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; 
CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w20 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret -entry: - %ptr = alloca i8, i32 %P1 - tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 - %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) - ret i32 -396142473 -} - - -define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: svecc_call_realign: -; CHECK0: // %bb.0: // %entry -; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill -; CHECK0-NEXT: .cfi_def_cfa_offset 64 -; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill -; CHECK0-NEXT: mov x29, sp -; CHECK0-NEXT: .cfi_def_cfa w29, 64 -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w26, -16 -; CHECK0-NEXT: .cfi_offset w27, -24 -; CHECK0-NEXT: .cfi_offset w28, -32 -; CHECK0-NEXT: .cfi_offset vg, -48 -; CHECK0-NEXT: .cfi_offset w30, -56 -; CHECK0-NEXT: .cfi_offset w29, -64 -; CHECK0-NEXT: addvl sp, sp, #-18 -; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 -; 
CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 -; CHECK0-NEXT: sub x9, sp, #1024 -; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK0-NEXT: mov w2, w1 -; CHECK0-NEXT: bl __arm_sme_state -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: //APP -; CHECK0-NEXT: //NO_APP -; CHECK0-NEXT: tbz w19, #0, .LBB36_2 -; CHECK0-NEXT: // %bb.1: // %entry -; CHECK0-NEXT: smstop sm -; CHECK0-NEXT: .LBB36_2: // %entry -; CHECK0-NEXT: mov x0, sp -; CHECK0-NEXT: mov w1, #45 // =0x2d -; CHECK0-NEXT: bl memset -; CHECK0-NEXT: tbz w19, #0, .LBB36_4 -; CHECK0-NEXT: // %bb.3: // %entry -; CHECK0-NEXT: smstart sm -; CHECK0-NEXT: .LBB36_4: // %entry -; CHECK0-NEXT: mov w0, #22647 // =0x5877 -; CHECK0-NEXT: movk w0, #59491, lsl #16 -; CHECK0-NEXT: addvl sp, x29, #-18 -; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: .cfi_restore z8 -; CHECK0-NEXT: .cfi_restore z9 -; CHECK0-NEXT: .cfi_restore z10 -; CHECK0-NEXT: .cfi_restore z11 -; CHECK0-NEXT: .cfi_restore z12 -; CHECK0-NEXT: .cfi_restore z13 -; CHECK0-NEXT: .cfi_restore z14 -; CHECK0-NEXT: .cfi_restore z15 -; CHECK0-NEXT: mov sp, x29 -; CHECK0-NEXT: .cfi_def_cfa wsp, 64 -; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload -; CHECK0-NEXT: .cfi_def_cfa_offset 0 -; CHECK0-NEXT: .cfi_restore w19 -; CHECK0-NEXT: .cfi_restore w26 -; CHECK0-NEXT: .cfi_restore w27 -; CHECK0-NEXT: .cfi_restore w28 -; CHECK0-NEXT: .cfi_restore vg -; CHECK0-NEXT: .cfi_restore w30 -; CHECK0-NEXT: .cfi_restore w29 -; CHECK0-NEXT: ret -; -; CHECK64-LABEL: 
svecc_call_realign: -; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 -; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 -; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w26, -24 -; CHECK64-NEXT: .cfi_offset w27, -32 -; CHECK64-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; 
CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1 +; CHECK1024-NOSPLITSVE-NEXT: mov w8, w0 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov w8, w8 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: mov x20, x0 +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x1fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x8 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB35_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB35_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w20 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: 
.cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; CHECK1024-SPLITSVE-NEXT: mov w2, w1 +; CHECK1024-SPLITSVE-NEXT: mov w8, w0 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov w8, w8 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: mov x20, x0 +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x1fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x8, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x8 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB35_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, x8 +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB35_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded 
Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w20 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret +entry: + %ptr = alloca i8, i32 %P1 + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) + ret i32 -396142473 +} + + +define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_call_realign: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 64 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: mov x29, sp +; CHECK0-NEXT: .cfi_def_cfa w29, 64 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w26, -16 +; CHECK0-NEXT: .cfi_offset w27, -24 +; CHECK0-NEXT: .cfi_offset w28, -32 +; CHECK0-NEXT: .cfi_offset vg, -48 +; CHECK0-NEXT: .cfi_offset w30, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 +; 
CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 +; CHECK0-NEXT: sub x9, sp, #1024 +; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK0-NEXT: mov w2, w1 +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: mov x19, x0 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: tbz w19, #0, .LBB36_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB36_2: // %entry +; CHECK0-NEXT: mov x0, sp +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB36_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB36_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: addvl sp, x29, #-18 +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: mov sp, x29 +; CHECK0-NEXT: .cfi_def_cfa wsp, 64 +; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w26 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: 
svecc_call_realign: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: .cfi_def_cfa w29, 64 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w26, -16 +; CHECK64-NEXT: .cfi_offset w27, -24 +; CHECK64-NEXT: .cfi_offset w28, -32 ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3960,30 +4100,32 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 
0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub x9, sp, #1088 ; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK64-NEXT: mov w2, w1 @@ -4006,22 +4148,31 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, 
[sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4034,20 +4185,11 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w26 @@ -4058,140 +4200,270 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_realign: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: 
cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // 
$d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub x9, sp, #2048 -; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov x19, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w19, #0, .LBB36_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB36_2: // %entry -; CHECK1024-NEXT: mov x0, sp -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB36_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB36_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: 
.cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_realign: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str 
z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub x9, sp, #2048 +; CHECK1024-NOSPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB36_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB36_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_realign: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 
@ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub x9, sp, #2048 +; CHECK1024-SPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-SPLITSVE-NEXT: mov w2, w1 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov x19, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB36_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, sp +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB36_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; 
CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: %ptr = alloca i8, i32 1000, align 32 tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 @@ -4311,13 +4583,12 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; ; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 -; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x26, x20, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -4330,41 +4601,43 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #112 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w20, -24 -; CHECK64-NEXT: .cfi_offset w26, -32 -; CHECK64-NEXT: .cfi_offset w27, -40 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w26, -24 +; CHECK64-NEXT: .cfi_offset w27, -32 ; CHECK64-NEXT: .cfi_offset w28, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: 
.cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * VG - 128 ; CHECK64-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK64-NEXT: ubfiz x8, x0, #2, #32 ; CHECK64-NEXT: mov x9, sp @@ -4385,22 +4658,23 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; 
CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4413,131 +4687,243 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: ldp x20, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul 
vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w20, -24 -; CHECK1024-NEXT: .cfi_offset w26, -32 -; CHECK1024-NEXT: .cfi_offset w27, -40 -; CHECK1024-NEXT: .cfi_offset w28, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 -; CHECK1024-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK1024-NEXT: ubfiz x8, x0, #2, #32 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x7fffffff0 -; CHECK1024-NEXT: sub x20, x9, x8 -; CHECK1024-NEXT: mov sp, x20 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add x0, x19, #8 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: sub x0, x29, #1024 -; CHECK1024-NEXT: addvl x0, x0, #-19 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: mov x0, x20 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: 
mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, 
mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-NOSPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x20 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x20 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; 
CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-SPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x20 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov x0, x20 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; 
CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i32, i32 10 %b = alloca <vscale x 4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index e411c23c77bbe..7b5621ff3b5a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) @@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index e86f7473363f7..37b5422be7e2f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll index 44b12a9f6fe81..61a61376d7ddd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s declare void @readsMem(ptr) #0 declare void @writesMem(ptr) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1cd9c0bfeb7e6..2351c969d5e49 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -165,10 +165,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -179,15 +179,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v3, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7529364..7f10ee4c17450 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i64: @@ -742,10 +743,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], 
v6, v5, 0 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -758,8 +759,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] ; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: @@ -771,8 +772,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] ; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -791,8 +792,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] ; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -808,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5 ; GFX1250-NEXT: v_mov_b32_e32 v8, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i96 %num, %den ret i96 %result @@ -1071,18 +1072,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 +; GFX7-NEXT: v_mov_b32_e32 v11, v3 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX7-NEXT: v_mul_lo_u32 
v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v11 -; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX7-NEXT: v_mov_b32_e32 v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: @@ -1092,18 +1095,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v11, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: @@ -1113,18 +1118,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], 
s[4:5], v9, v12, v[3:4] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i128: @@ -1138,11 +1145,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] @@ -1155,15 +1162,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 ; GFX11-NEXT: v_mov_b32_e32 v12, v3 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 -; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 -; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1184,14 +1192,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v11 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mov_b32_e32 v2, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 
v14, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo @@ -1210,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1] ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11] -; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11] +; GFX1250-NEXT: v_mov_b32_e32 v10, v1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v13, v10 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13] +; GFX1250-NEXT: v_mov_b32_e32 v11, v12 +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11] ; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1 @@ -2401,207 +2409,216 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX7-NEXT: v_mov_b32_e32 v22, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, 
v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX7-NEXT: v_mov_b32_e32 v21, v22 +; GFX7-NEXT: v_mov_b32_e32 v22, v23 +; GFX7-NEXT: v_mov_b32_e32 v23, v18 +; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX7-NEXT: v_mov_b32_e32 v20, v23 +; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v21, v20 -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX7-NEXT: v_mov_b32_e32 v12, v22 +; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; 
GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-NEXT: v_mov_b32_e32 v2, v14 +; GFX7-NEXT: v_mov_b32_e32 v7, v11 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX8-NEXT: v_mov_b32_e32 v22, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, 
v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX8-NEXT: v_mov_b32_e32 v21, v22 +; GFX8-NEXT: v_mov_b32_e32 v22, v23 +; GFX8-NEXT: v_mov_b32_e32 v23, v18 +; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX8-NEXT: v_mov_b32_e32 v20, v23 +; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v21, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v12, v22 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; 
GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v7, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc -; GFX9-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 
v[22:23], vcc, v4, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc +; GFX9-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v18 +; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX9-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, 
s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2609,68 +2626,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 +; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, 
v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19] +; GFX10-NEXT: v_mov_b32_e32 v18, v23 +; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX10-NEXT: v_mov_b32_e32 v19, v24 +; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX10-NEXT: v_mov_b32_e32 v18, v21 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22] ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6 +; GFX10-NEXT: v_mov_b32_e32 v14, v20 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22] +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; 
GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4 ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2681,66 +2699,65 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 ; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0 +; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15 ; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v20, v8 -; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v21, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12 +; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8] +; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21] -; 
GFX11-NEXT: v_mov_b32_e32 v6, v25 -; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8] +; GFX11-NEXT: v_mov_b32_e32 v7, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX11-NEXT: v_mov_b32_e32 v8, v24 +; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8] +; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2 -; GFX11-NEXT: v_mov_b32_e32 v11, v1 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, v24 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12] -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22] +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] +; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, s4 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3 ; GFX11-NEXT: 
v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0 ; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2752,101 +2769,103 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, 
v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mov_b32_e32 v18, v23 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX12-NEXT: v_mov_b32_e32 v19, v24 +; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19] ; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX12-NEXT: v_mov_b32_e32 v18, v21 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 +; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22] ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21 +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 
v[18:19], s2, v2, v9, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0 ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,87 +2874,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX1250-NEXT: v_mul_lo_u32 v27, v5, 
v10 -; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 -; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo ; GFX1250-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 -; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 +; GFX1250-NEXT: v_mov_b32_e32 v13, v18 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21] ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1] +; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2 ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] -; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] -; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2 ; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] -; GFX1250-NEXT: v_cndmask_b32_e64 
v3, 0, 1, s6 ; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, v15 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v14 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2949,60 +2970,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_zext_with_vregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX8-NEXT: flat_load_dword v4, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_vregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_vregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_vregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v4, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_zext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -3130,33 +3151,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_sext_with_vregs: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_vregs: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; 
GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3183,17 +3207,17 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: s_mul_u64_sext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_sext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4f2c454e13356..01c601f0646b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,128 +31,128 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v8, v6 ; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: 
v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, 
vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -220,65 +220,65 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: 
v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s13 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -291,39 +291,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s13 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s11 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, s11 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4 ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, 
vcc, 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -382,263 +382,263 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6] ; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; 
GISEL-NEXT: v_mul_hi_u32 v13, v9, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: 
v_add_i32_e32 v13, vcc, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, 
s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v17, v2, 
v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 +; GISEL-NEXT: 
v_subb_u32_e64 v3, s[4:5], v17, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,100 +667,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CGP-NEXT: v_trunc_f32_e32 v5, v4 ; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 +; CGP-NEXT: 
v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v15 +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v15 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v13, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] @@ -771,13 +771,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; 
CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 @@ -785,8 +785,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v15, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,126 +840,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: 
v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v6 +; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v14, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v5 +; CGP-NEXT: v_add_i32_e32 v7, 
vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v13, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1049,82 +1049,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; 
CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 
v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1133,40 +1133,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: 
v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1215,46 +1215,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; 
GISEL-NEXT: v_mul_lo_u32 v14, v7, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16 ; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 @@ -1263,46 +1263,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc 
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1319,74 +1319,74 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 
v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 @@ -1394,8 +1394,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 
v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -1406,12 +1406,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1430,112 +1430,112 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc +; CGP-NEXT: v_sub_i32_e64 
v13, s[4:5], v17, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc +; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1553,72 +1553,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: 
v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1626,24 +1626,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; 
CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -1679,126 +1679,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CHECK-NEXT: v_trunc_f32_e32 v7, v6 ; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v3, 
vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_add_i32_e32 
v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1850,8 +1850,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 @@ -1859,182 +1859,183 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v13, v11 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; 
GISEL-NEXT: v_mul_hi_u32 v13, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1 ; GISEL-NEXT: 
v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; 
GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 
v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2042,25 +2043,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -2074,39 +2075,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] +; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,126 +2138,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 ; CGP-NEXT: v_trunc_f32_e32 v12, v11 ; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v19, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v16, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v14 
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v16, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v19, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v16, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v19, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: 
v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v18, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v18, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v18, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: 
v_addc_u32_e32 v8, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v17, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 @@ -2313,128 +2313,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; CGP-NEXT: v_trunc_f32_e32 v10, v8 ; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v6, v16, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, 
v11, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 +; CGP-NEXT: v_mul_lo_u32 v6, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 
v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2504,15 +2504,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: 
v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 @@ -2537,198 +2537,198 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: 
v_mul_lo_u32 v8, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3 ; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6] +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 ; 
GISEL-NEXT: v_trunc_f32_e32 v8, v6 ; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] +; 
GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3] +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7] ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6 +; 
GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: @@ -2769,27 +2769,27 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3 ; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3 -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CGP-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v2 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v8, v3 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v6, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7 -; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591a5fcce..f4489c2239fda 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -175,65 +175,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, 
v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1 +; GFX8-NEXT: 
v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3 @@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s17, 31 ; GFX9-NEXT: s_ashr_i32 s4, s19, 31 @@ -335,64 +336,63 @@ define amdgpu_kernel void 
@sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: 
v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 
0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -1311,68 +1311,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: 
v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: 
v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1385,38 +1385,38 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_ashr_i32 s10, s3, 31 -; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: 
v_cmp_le_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1] ; GFX8-NEXT: s_add_u32 s0, s18, s6 ; GFX8-NEXT: s_addc_u32 s1, s19, s6 ; GFX8-NEXT: s_add_u32 s2, s2, s10 @@ -1424,15 +1424,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_addc_u32 s3, s3, s10 ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1441,151 +1441,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2] +; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, 
vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v6, s17 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; 
GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7 ; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; 
GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1622,66 +1622,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, 
v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: 
v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1693,51 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: 
v_cmp_le_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1747,31 +1747,31 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3] +; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: 
v_mul_hi_u32 v3, v17, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1779,119 +1779,119 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9 ; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, s9, 
v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v12, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: 
v_add_co_u32_e32 v7, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9 ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 40b5db0a15447..6f42239cd191d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v6, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: 
v_cvt_u32_f32_e32 v13, v6 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; 
CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, 
s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -214,65 +214,65 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; 
CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s9 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] @@ -372,84 +372,84 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v11, v9 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 @@ -457,148 +457,148 @@ define <2 x i64> 
@v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 ; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] +; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v12, v9 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: 
v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], 
v16, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; 
GISEL-NEXT: v_mul_hi_u32 v9, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -651,128 +651,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v4, v3 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; 
CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v12 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v15 +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_xor_b32_e32 v16, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v16, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v2, 
v16, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v16, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v15 +; CGP-NEXT: 
v_xor_b32_e32 v1, v1, v15 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,128 +820,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, 
vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v14, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v14, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: 
v_add_i32_e32 v10, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -977,82 +977,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 
v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: 
v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1060,39 +1060,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: 
v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1141,92 +1141,92 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: 
v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; 
GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1243,74 +1243,74 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; 
GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, 
vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1352,110 +1352,110 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, 
vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: 
v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1473,72 +1473,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; 
CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1558,10 +1558,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -1573,82 +1573,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc 
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1656,39 +1656,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 
v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1737,92 +1737,92 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 
v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: 
v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1839,74 +1839,74 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, 
v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; 
GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1948,110 +1948,110 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: 
v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; 
CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -2069,72 +2069,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: 
v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -2154,10 +2154,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -2193,130 +2193,130 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v7, v5 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; 
CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11 +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; 
CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2361,85 +2361,85 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v12, v10 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 
v[11:12], s[4:5], v14, v16, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10 ; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 ; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 @@ -2448,127 +2448,127 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6 ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 ; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 ; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8 ; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1] +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11] +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v13, v10 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; 
GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; 
GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; 
GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,19 +2577,19 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -2645,103 +2645,103 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v12, v10 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_mul_hi_u32 v11, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v15, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 -; CGP-NEXT: v_add_i32_e32 v10, 
vcc, v18, v10 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v14, v4, v16 +; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v13 +; CGP-NEXT: v_xor_b32_e32 v17, v8, v16 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v8, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v13 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v17, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10] +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2754,11 +2754,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc @@ -2766,10 +2766,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2819,117 +2819,117 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: 
v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v4, v14 ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: 
v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: 
v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -2938,11 +2938,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3004,15 +3004,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3 @@ -3035,196 +3035,196 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v8, 
v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 
v5, v13, v10 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; 
GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7 
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -3264,15 +3264,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 9e412b6c7cd0a..23ef596c021c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -132,65 +132,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; 
GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 -; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; 
GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -271,63 +271,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s19 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3 ; GFX9-NEXT: 
v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -1005,72 +1005,72 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: 
v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1083,136 +1083,136 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 +; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX8-NEXT: v_trunc_f32_e32 v4, v3 ; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3 -; 
GFX8-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8 +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13 +; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX8-NEXT: 
v_mul_lo_u32 v8, v12, v5 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 +; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 
1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] @@ -1279,60 +1279,61 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: 
v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1349,114 +1350,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, 
s[0:1], 0, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v4, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v5, 
v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13 ; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7 ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 ; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll new file mode 100644 index 0000000000000..d7d623ac89146 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +; Make sure we do not infer anything about implicit inputs through an +; intrinsic call which is not nocallback. + +declare zeroext i32 @return_i32() + +define i32 @test_i32_return() gc "statepoint-example" { +; CHECK-LABEL: define i32 @test_i32_return( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]]) +; CHECK-NEXT: ret i32 [[CALL1]] +; +entry: + %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + ret i32 %call1 +} + +declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...) +declare i32 @llvm.experimental.gc.result.i32(token) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll new file mode 100644 index 0000000000000..71c509afa8e64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s + +; Make sure we infer no inputs are used through some intrinsics + +define void @use_fake_use(i32 %arg) { +; CHECK-LABEL: define void @use_fake_use( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]]) +; CHECK-NEXT: ret void +; + call void (...) 
@llvm.fake.use(i32 %arg) + ret void +} + +define void @use_donothing() { +; CHECK-LABEL: define void @use_donothing( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: ret void +; + call void @llvm.donothing() + ret void +} + +define void @use_assume(i1 %arg) { +; CHECK-LABEL: define void @use_assume( +; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]]) +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 %arg) + ret void +} + +define void @use_trap() { +; CHECK-LABEL: define void @use_trap( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: ret void +; + call void @llvm.trap() + ret void +} + +define void @use_debugtrap() { +; CHECK-LABEL: define void @use_debugtrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.debugtrap() +; CHECK-NEXT: ret void +; + call void @llvm.debugtrap() + ret void +} + +define void @use_ubsantrap() { +; CHECK-LABEL: define void @use_ubsantrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.ubsantrap(i8 0) +; CHECK-NEXT: ret void +; + call void @llvm.ubsantrap(i8 0) + ret void +} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll new file mode 100644 index 0000000000000..06150e4277e9a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s + +; These situations are "special" in that they either have an alloca that is not +; in the entry block or that they have a dynamic alloca. Both situations affect +; prolog/epilog generation. + +declare amdgpu_gfx void @foo() + +define amdgpu_cs_chain void @test_alloca() { +; GFX12-LABEL: test_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s0, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s0, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_lshl_b32 s0, s0, 6 +; GFX942-NEXT: s_mov_b32 s1, s32 +; GFX942-NEXT: s_add_i32 s32, s1, s0 +; GFX942-NEXT: scratch_store_dword off, v0, s1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var(i32 %count) { +; GFX12-LABEL: test_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 
15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call() { +; GFX12-LABEL: test_alloca_and_call: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s2, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s2, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s2, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s2, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds 
= %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_and_call_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: s_add_i32 s32, s3, s2 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { +; GFX12-LABEL: test_alloca_and_call_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s1, 
s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca() { +; GFX12-LABEL: test_call_and_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, 0x200 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: scratch_store_b32 off, v0, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: s_add_i32 s32, s4, 0x400 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: scratch_store_dword off, v0, s4 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_call_and_alloca_var_uniform: +; 
GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_add_i32 s32, s4, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { +; GFX12-LABEL: test_call_and_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v0, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, s0, 5, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s32, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: 
scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v0, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index f6ae516faf2f7..89d039406f375 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -1489,7 +1489,7 @@ attributes #2 = { noinline } !0 = !{float 3.0} ;. 
; CHECK: attributes #[[ATTR0]] = { strictfp } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) } ; CHECK: attributes #[[ATTR3]] = { noinline } ; CHECK: attributes #[[ATTR4]] = { nobuiltin } diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 0fc54aeaef77b..26f77898faf60 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -2407,51 +2407,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 +; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 +; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], 
s[4:5], v29, v32, v[14:15] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] ; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v14 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mov_b32_e32 v2, v23 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mov_b32_e32 v23, v25 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v22, vcc +; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 -; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc ; 
GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 ; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 -; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc @@ -3216,36 +3217,38 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 -; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 -; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20 ; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 -; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 +; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23] +; GISEL-NEXT: v_mov_b32_e32 v18, v26 +; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18] +; GISEL-NEXT: v_mov_b32_e32 v22, v28 +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, 
v[16:17] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 0548bcf304c32..279f4298e6418 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -34,3 +34,16 @@ body: | $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec ... + +--- +name: test_tied +body: | + bb.0: + ; CHECK-LABEL: name: test_tied + ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index e0421575c3174..460f1211d1386 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -237,31 +237,31 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -275,17 +275,18 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], 
s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_7: ; %Flow2 @@ -604,31 +605,31 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -642,17 +643,18 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], 
v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB1_7: ; %Flow2 @@ -962,31 +964,31 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -999,12 +1001,14 @@ define i128 
@fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1314,31 +1318,31 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -1351,12 +1355,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: 
v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1702,31 +1708,31 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB6_4: ; %Flow @@ -2050,31 +2056,31 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; 
GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB7_4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..df635925b87df 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 
4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; 
SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; 
%inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,82 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 s[0:1], exec, 0 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) +; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination + %cond1_32 = zext i1 %cond1 to i32 + callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop 
[label %return] + +outer_loop: + ; %cond2 = icmp eq i32 %tmp, 2 + ; br i1 %cond2, label %outer_loop, label %inner_loop + callbr void asm "", ""() to label %inner_loop [] + +inner_loop: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 999, ptr addrspace(1) %out, align 4 + %cond3 = icmp eq i32 %tmp, 3 + %cond3_32 = zext i1 %cond3 to i32 + callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] + +return: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d4..f705a2ffc4f1d 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5775,28 +5775,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX8-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5831,28 +5831,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5883,28 +5883,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, 
vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5935,29 +5935,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 -; 
GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5] ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6408,52 +6408,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, 
v[8:9] +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, 
v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6513,52 +6513,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6610,52 +6610,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX900-GISEL-NEXT: 
v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6707,54 +6707,54 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v6, v[8:9] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; GFX90A-GISEL-NEXT: 
v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0 ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index fa52b96e9ea95..02eda2c4822c2 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -6,40 +6,12 @@ # No more registers shall be defined --- name: main -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false tracksRegLiveness: true registers: - - { id: 1, class: sreg_32_xm0, 
preferred-register: '%1' } - - { id: 2, class: vreg_64, preferred-register: '%2' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64 } - - { id: 6, class: vreg_96 } - - { id: 7, class: vreg_96 } - - { id: 8, class: vreg_128 } - - { id: 9, class: vreg_128 } -liveins: - - { reg: '$sgpr6', virtual-reg: '%1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false + - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } + - { id: 1, class: vreg_64, preferred-register: '%1' } body: | - bb.0.entry: + bb.0: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-LABEL: name: main @@ -59,20 +31,21 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr - %3 = IMPLICIT_DEF - undef %4.sub0 = COPY $sgpr0 - %4.sub1 = COPY %3.sub0 - undef %5.sub0 = COPY %4.sub1 - %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr + %2:vreg_64 = IMPLICIT_DEF + undef %3.sub0:vreg_64 = COPY $sgpr0 + %3.sub1:vreg_64 = COPY %2.sub0 + undef %4.sub0:vreg_64 = COPY %3.sub1 + %4.sub1:vreg_64 = COPY %3.sub0 + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr - %6 = IMPLICIT_DEF - undef %7.sub0_sub1 = COPY %6 - %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr + %5:vreg_96 = IMPLICIT_DEF + undef %6.sub0_sub1:vreg_96 = COPY %5 + %6.sub2:vreg_96 = COPY %2.sub0 + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr + + %7:vreg_128 = IMPLICIT_DEF + undef %8.sub0_sub1_sub2:vreg_128 = COPY %7 + %8.sub3:vreg_128 = COPY %2.sub0 + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr - %8 = IMPLICIT_DEF - undef %9.sub0_sub1_sub2 = COPY %8 - %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr ... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 1ab4cb0f00192..d82d6bcb437cc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39 ; GISEL12-NEXT: s_wait_kmcnt 0x0 ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1 -; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3 -; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5 -; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7 -; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9 -; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11 -; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13 -; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15 +; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3 +; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5 +; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7 +; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9 +; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11 +; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13 +; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15 ; GISEL12-NEXT: s_mov_b32 exec_lo, s9 -; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41 +; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43 +; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45 +; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47 +; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49 +; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51 +; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53 +; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55 ; GISEL12-NEXT: .LBB5_2: ; %tail ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GISEL10-NEXT: v_mov_b32_e32 v24, v0 -; GISEL10-NEXT: v_mov_b32_e32 v25, v1 -; GISEL10-NEXT: v_mov_b32_e32 v26, v2 -; GISEL10-NEXT: v_mov_b32_e32 v27, v3 -; GISEL10-NEXT: v_mov_b32_e32 v28, v4 -; GISEL10-NEXT: v_mov_b32_e32 v29, v5 -; GISEL10-NEXT: v_mov_b32_e32 v30, v6 -; GISEL10-NEXT: v_mov_b32_e32 v31, v7 -; GISEL10-NEXT: v_mov_b32_e32 v32, v8 -; GISEL10-NEXT: v_mov_b32_e32 v33, v9 -; GISEL10-NEXT: v_mov_b32_e32 v34, v10 -; GISEL10-NEXT: v_mov_b32_e32 v35, v11 -; GISEL10-NEXT: v_mov_b32_e32 v36, v12 -; GISEL10-NEXT: v_mov_b32_e32 v37, v13 -; GISEL10-NEXT: v_mov_b32_e32 v38, v14 -; GISEL10-NEXT: v_mov_b32_e32 v39, v15 +; GISEL10-NEXT: v_mov_b32_e32 v40, v0 +; GISEL10-NEXT: v_mov_b32_e32 v41, v1 +; GISEL10-NEXT: 
v_mov_b32_e32 v42, v2 +; GISEL10-NEXT: v_mov_b32_e32 v43, v3 +; GISEL10-NEXT: v_mov_b32_e32 v44, v4 +; GISEL10-NEXT: v_mov_b32_e32 v45, v5 +; GISEL10-NEXT: v_mov_b32_e32 v46, v6 +; GISEL10-NEXT: v_mov_b32_e32 v47, v7 +; GISEL10-NEXT: v_mov_b32_e32 v48, v8 +; GISEL10-NEXT: v_mov_b32_e32 v49, v9 +; GISEL10-NEXT: v_mov_b32_e32 v50, v10 +; GISEL10-NEXT: v_mov_b32_e32 v51, v11 +; GISEL10-NEXT: v_mov_b32_e32 v52, v12 +; GISEL10-NEXT: v_mov_b32_e32 v53, v13 +; GISEL10-NEXT: v_mov_b32_e32 v54, v14 +; GISEL10-NEXT: v_mov_b32_e32 v55, v15 ; GISEL10-NEXT: s_mov_b32 exec_lo, s9 -; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL10-NEXT: v_mov_b32_e32 v24, v40 +; GISEL10-NEXT: v_mov_b32_e32 v25, v41 +; GISEL10-NEXT: v_mov_b32_e32 v26, v42 +; GISEL10-NEXT: v_mov_b32_e32 v27, v43 +; GISEL10-NEXT: v_mov_b32_e32 v28, v44 +; GISEL10-NEXT: v_mov_b32_e32 v29, v45 +; GISEL10-NEXT: v_mov_b32_e32 v30, v46 +; GISEL10-NEXT: v_mov_b32_e32 v31, v47 +; GISEL10-NEXT: v_mov_b32_e32 v32, v48 +; GISEL10-NEXT: v_mov_b32_e32 v33, v49 +; GISEL10-NEXT: v_mov_b32_e32 v34, v50 +; GISEL10-NEXT: v_mov_b32_e32 v35, v51 +; GISEL10-NEXT: v_mov_b32_e32 v36, v52 +; GISEL10-NEXT: v_mov_b32_e32 v37, v53 +; GISEL10-NEXT: v_mov_b32_e32 v38, v54 +; GISEL10-NEXT: v_mov_b32_e32 v39, v55 ; GISEL10-NEXT: .LBB5_2: ; %tail ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GISEL10-NEXT: v_mov_b32_e32 v8, v24 diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll new file mode 100644 index 0000000000000..8eefc9dfc5d7e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-function.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define private void @foo() { +; CHECK-LABEL: foo: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +@var = global ptr @foo diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll deleted file mode 100644 index 05a0e39d4a715..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll +++ /dev/null @@ -1,325 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s - -define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { -; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users( -; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; OPT-NEXT: [[ENTRY:.*:]] -; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison -; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 -; OPT-NEXT: [[TMP4:%.*]] = 
extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 -; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 -; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 -; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 -; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 -; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 -; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 -; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 -; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 -; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0 -; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1 -; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2 -; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3 -; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4 -; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5 -; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6 -; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7 -; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8 -; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], 
i64 9 -; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9 -; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10 -; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11 -; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12 -; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13 -; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14 -; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15 -; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0 -; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1 -; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2 -; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3 -; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4 -; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5 -; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6 -; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7 -; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8 -; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9 -; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10 -; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11 -; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12 -; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13 -; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14 -; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15 -; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; 
OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0 -; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1 -; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2 -; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3 -; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4 -; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5 -; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6 -; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7 -; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8 -; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9 -; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10 -; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11 -; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12 -; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13 -; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14 -; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15 -; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0 -; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1 -; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2 -; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3 -; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4 -; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5 -; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6 -; 
OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7 -; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8 -; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9 -; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10 -; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11 -; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12 -; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13 -; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14 -; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15 -; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0 -; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1 -; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2 -; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3 -; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4 -; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5 -; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6 -; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7 -; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8 -; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9 -; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10 -; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11 -; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12 -; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: 
[[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13 -; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14 -; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15 -; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0 -; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1 -; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2 -; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3 -; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4 -; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5 -; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6 -; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7 -; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8 -; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9 -; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10 -; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11 -; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12 -; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13 -; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14 -; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15 -; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0 -; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1 -; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2 -; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3 -; 
OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4 -; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5 -; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6 -; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7 -; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8 -; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9 -; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10 -; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11 -; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12 -; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13 -; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14 -; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15 -; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80 -; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0 -; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81 -; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1 -; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82 -; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2 -; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83 -; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3 -; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84 -; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4 -; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85 -; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5 -; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86 -; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6 -; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87 -; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7 -; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88 -; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8 -; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89 -; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9 -; OPT-NEXT: [[TMP276:%.*]] = extractelement 
<128 x i8> [[TMP255]], i32 90 -; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10 -; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91 -; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11 -; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92 -; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12 -; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93 -; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13 -; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94 -; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14 -; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95 -; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15 -; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]] -; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 -; OPT-NEXT: ret void -; -entry: - %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5) - %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16 - %sum = add <16 x i8> %load, %add - store <16 x i8> %sum, ptr addrspace(3) %out, align 16 - ret void -} - -attributes #0 = {"amdgpu-waves-per-eu"="2,2"} diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir new file mode 100644 index 0000000000000..381cb8c9d1047 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s + +# This test is to check fix for failure with "Bad machine code: Defining instruction does not modify register" due to corrupt lane mask. 
+ +--- +name: reg_coalescer_subreg_liveness +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: $vcc_lo = COPY $exec_lo + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32 = S_MOV_B32 1 + undef %3.sub0:sgpr_128 = COPY %2 + %4:sreg_32 = S_MOV_B32 0 + undef %5.sub0:sgpr_256 = COPY %4 + TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %6:sgpr_128 = COPY killed %3 + %6.sub1:sgpr_128 = COPY killed %1 + %7:sreg_32 = COPY $exec_lo + %8:sreg_32 = COPY %2 + %9:sreg_32 = COPY %4 + + bb.1: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + %10:sreg_32 = COPY killed %8 + undef %11.sub0:sgpr_128 = COPY %2 + %11.sub1:sgpr_128 = COPY killed %10 + %11.sub2:sgpr_128 = COPY %2 + %11.sub3:sgpr_128 = COPY %2 + TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %12:sreg_32 = COPY killed %9 + %13:sgpr_128 = COPY %6 + %13.sub2:sgpr_128 = COPY killed %12 + TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + $vcc_lo = COPY %7 + %8:sreg_32 = COPY %4 + %9:sreg_32 = COPY %2 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... 
+--- +name: reg_coalescer_subreg_liveness_2 +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %3:sreg_32 = S_MOV_B32 1 + undef %4.sub0:sgpr_128 = COPY %3 + %5:sgpr_128 = COPY %4 + %5.sub1:sgpr_128 = COPY killed %2 + %6:sgpr_128 = COPY %5 + %6.sub2:sgpr_128 = COPY killed %1 + %7:sreg_32 = S_MOV_B32 0 + undef %8.sub0:sgpr_256 = COPY %7 + %9:sreg_32 = COPY %3 + + bb.1: + successors: %bb.2(0x80000000) + + %10:sreg_32 = COPY killed %9 + undef %11.sub0:sgpr_128 = COPY %3 + %11.sub1:sgpr_128 = COPY killed %10 + S_NOP 0, implicit %5, implicit %8 + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index 002d43f937837..131656975ec40 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX --- @@ -40,6 +41,27 @@ body: | S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ... 
+--- +name: meta_in_between +body: | + bb.0: + ; GCN-LABEL: name: meta_in_between + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: KILL $sgpr0 + ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + KILL $sgpr0 + $sgpr0 = IMPLICIT_DEF + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +... + --- name: valu_write_in_between body: | diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; 
OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682cf9f9f..004c27971131d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: 
+; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 
[[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + 
%n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 4d5ade4abcef7..1b4ed67eb6eea 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2626,9 +2628,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2654,9 +2656,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2677,12 +2679,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2816,10 +2818,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2853,10 +2855,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2881,16 +2883,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11] -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0 +; 
GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3068,31 +3070,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,31 +3139,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3204,32 +3202,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3550,63 +3548,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: 
v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], 
s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3695,63 +3693,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 
v[0:1], s[4:5], v1, v16, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], 
s[4:5], v5, v20, v[12:13] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3827,65 +3825,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 ; 
GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4 +; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6 +; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8 +; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v26, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17] +; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14 +; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], 
s[0:1], v20, v12, v[4:5] -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index a1381ecad81e2..f964480dcc633 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -1069,6 +1069,51 @@ body: | $sgpr0 = S_MOV_B32 $sgpr0 ... +# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. +--- +name: mixed_pending_events +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: mixed_pending_events + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr2, $vgpr2 + $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec +... 
+ --- name: pending_vmem_event_between_block tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir new file mode 100644 index 0000000000000..df3e780c61f46 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s + +--- +name: wmma_test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: wmma_test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[DEF]], 8, [[DEF1]], 8, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[DEF2]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY %3.sub1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + %0:vreg_128 = IMPLICIT_DEF + %1:vreg_128 = IMPLICIT_DEF + %2:sreg_32 = IMPLICIT_DEF + early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %0:vreg_128, 8, %1:vreg_128, 8, 0, 0, 0, implicit $exec + %4:sreg_32 = SI_IF %2:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %5:vreg_256 = COPY %3.sub1:vreg_256 + + bb.2: + SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 9628405df6bcb..1448fac8d864f 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -1,223 +1,1004 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefix=GNU %s +; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s +; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s +; RUN: llc -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s +; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefix=WATCHABI %s define { half, half } @test_sincos_f16(half %a) { -; CHECK-LABEL: test_sincos_f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r1, r0 +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r1, r0 +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r4} +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r5, r0 +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: add 
sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s1, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } define half @test_sincos_f16_only_use_sin(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_sin: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_sin: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_sin: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_sin: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 } define half @test_sincos_f16_only_use_cos(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_cos: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; 
CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_cos: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_cos: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #4] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_cos: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 } define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { -; CHECK-LABEL: test_sincos_v2f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #12] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp, #4] -; CHECK-NEXT: strh.w r0, [sp, #22] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #20] -; CHECK-NEXT: add r0, sp, #20 -; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32] -; CHECK-NEXT: ldr r0, [sp, #8] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp] -; CHECK-NEXT: strh.w r0, [sp, #18] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #16] -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vmovl.u16 q9, d8 -; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT: vmovl.u16 q8, d16 
-; CHECK-NEXT: vmov.32 r0, d18[0] -; CHECK-NEXT: vmov.32 r1, d18[1] -; CHECK-NEXT: vmov.32 r2, d16[0] -; CHECK-NEXT: vmov.32 r3, d16[1] -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #24 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #12] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp, #4] +; GNU-NEXT: strh.w r0, [sp, #22] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #20] +; GNU-NEXT: add r0, sp, #20 +; GNU-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNU-NEXT: ldr r0, [sp, #8] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp] +; GNU-NEXT: strh.w r0, [sp, #18] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #16] +; GNU-NEXT: add r0, sp, #16 +; GNU-NEXT: vmovl.u16 q9, d8 +; GNU-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNU-NEXT: vmovl.u16 q8, d16 +; GNU-NEXT: vmov.32 r0, d18[0] +; GNU-NEXT: vmov.32 r1, d18[1] +; GNU-NEXT: vmov.32 r2, d16[0] +; GNU-NEXT: vmov.32 r3, d16[1] +; GNU-NEXT: add sp, #24 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #24 +; GNUEABI-NEXT: sub sp, sp, #24 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #12] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp, #4] +; GNUEABI-NEXT: strh r0, [sp, #22] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #20] +; GNUEABI-NEXT: add r0, sp, #20 +; GNUEABI-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp] +; GNUEABI-NEXT: strh r0, [sp, #18] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #16] +; GNUEABI-NEXT: add r0, sp, #16 +; GNUEABI-NEXT: vmovl.u16 q9, d8 +; GNUEABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNUEABI-NEXT: vmovl.u16 q8, d16 +; GNUEABI-NEXT: vmov.32 r0, d18[0] +; GNUEABI-NEXT: vmov.32 r1, d18[1] +; GNUEABI-NEXT: vmov.32 r2, d16[0] +; GNUEABI-NEXT: vmov.32 r3, d16[1] +; GNUEABI-NEXT: add sp, sp, #24 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: sub sp, sp, #8 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r1 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #6] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl 
___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #4] +; IOS-NO-STRET-NEXT: add r0, sp, #4 +; IOS-NO-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #2] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp] +; IOS-NO-STRET-NEXT: mov r0, sp +; IOS-NO-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-NO-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-NO-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-NO-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-NO-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-NO-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-NO-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-NO-STRET-NEXT: add sp, sp, #8 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #24 +; IOS-WITH-STRET-NEXT: mov r4, r0 +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #8] +; IOS-WITH-STRET-NEXT: ldr r4, [sp, #12] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: ldm sp, {r1, r5} +; IOS-WITH-STRET-NEXT: strh r0, [sp, #22] +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #20] +; IOS-WITH-STRET-NEXT: add r0, sp, #20 +; IOS-WITH-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #18] +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #16] +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-WITH-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-WITH-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-WITH-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-WITH-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-WITH-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-WITH-STRET-NEXT: add sp, sp, #24 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_v2f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vpush {d8} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: vmov.f32 s16, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s4, s16 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmov.f32 s0, s4 +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #6] +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vmov r0, s0 +; 
WATCHABI-NEXT: vcvtb.f16.f32 s0, s20 +; WATCHABI-NEXT: strh.w r0, [sp, #4] +; WATCHABI-NEXT: add r0, sp, #4 +; WATCHABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #2] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmovl.u16 q0, d16 +; WATCHABI-NEXT: strh.w r0, [sp] +; WATCHABI-NEXT: mov r0, sp +; WATCHABI-NEXT: vld1.32 {d18[0]}, [r0:32] +; WATCHABI-NEXT: vmovl.u16 q1, d18 +; WATCHABI-NEXT: vmov.f32 s2, s4 +; WATCHABI-NEXT: vmov.f32 s3, s5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8} +; WATCHABI-NEXT: vpop {d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } define { float, float } @test_sincos_f32(float %a) { -; CHECK-LABEL: test_sincos_f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldrd r1, r0, [sp], #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldrd r1, r0, [sp], #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: ldr r1, [sp], #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: pop {r0, r1} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { float, float } @llvm.sincos.f32(float %a) ret { float, float } %result } define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_sincos_v2f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vldr s1, [sp, #4] -; CHECK-NEXT: vldr s3, [sp] -; CHECK-NEXT: vldr s0, [sp, #12] -; CHECK-NEXT: 
vldr s2, [sp, #8] -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_v2f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: vmov d8, r0, r1 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: vmov r0, s17 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vmov r0, s16 +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vldr s1, [sp, #4] +; GNU-NEXT: vldr s3, [sp] +; GNU-NEXT: vldr s0, [sp, #12] +; GNU-NEXT: vldr s2, [sp, #8] +; GNU-NEXT: vmov r0, r1, d0 +; GNU-NEXT: vmov r2, r3, d1 +; GNU-NEXT: add sp, #16 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_v2f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: vmov d8, r0, r1 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: vmov r0, s17 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vmov r0, s16 +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vldr s1, [sp, #4] +; GNUEABI-NEXT: vldr s3, [sp] +; GNUEABI-NEXT: vldr s0, [sp, #12] +; GNUEABI-NEXT: vldr s2, [sp, #8] +; GNUEABI-NEXT: vmov r0, r1, d0 +; GNUEABI-NEXT: vmov r2, r3, d1 +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vmov r4, s17 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: vmov r6, s16 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r7, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r5 +; IOS-NO-STRET-NEXT: mov r3, r4 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: vmov d8, r0, r1 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: vmov r1, s17 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vmov r1, s16 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vldr s1, [sp] +; IOS-WITH-STRET-NEXT: vldr s3, [sp, #4] +; IOS-WITH-STRET-NEXT: vldr s0, [sp, #8] +; IOS-WITH-STRET-NEXT: vldr s2, [sp, #12] +; IOS-WITH-STRET-NEXT: vmov r0, r1, d0 +; IOS-WITH-STRET-NEXT: vmov r2, r3, d1 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_v2f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10} +; WATCHABI-NEXT: 
.cfi_def_cfa_offset 32 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d9, -24 +; WATCHABI-NEXT: .cfi_offset d8, -32 +; WATCHABI-NEXT: vmov.f64 d8, d0 +; WATCHABI-NEXT: vmov.f32 s0, s17 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s19, s0 +; WATCHABI-NEXT: vmov.f32 s0, s16 +; WATCHABI-NEXT: vmov.f32 s21, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: vmov.f32 s18, s0 +; WATCHABI-NEXT: vmov.f64 d1, d10 +; WATCHABI-NEXT: vmov.f64 d0, d9 +; WATCHABI-NEXT: vpop {d8, d9, d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } define { double, double } @test_sincos_f64(double %a) { -; CHECK-LABEL: test_sincos_f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #8] -; CHECK-NEXT: ldrd r2, r3, [sp], #16 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #8] +; GNU-NEXT: ldrd r2, r3, [sp], #16 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldm sp, {r2, r3} +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: ldr r1, [sp, #12] +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: mov r4, r1 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r6, r0 +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: mov r1, r4 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r3, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r7 +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: mov r2, r1 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d16, [sp, #8] +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r1} +; IOS-WITH-STRET-NEXT: vmov r2, r3, d16 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { double, double } @llvm.sincos.f64(double %a) ret { double, double } %result } define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: @ %bb.0: -; 
CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: add r2, sp, #24 -; CHECK-NEXT: add r3, sp, #16 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #40] -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: vldr d19, [sp, #8] -; CHECK-NEXT: vldr d18, [sp, #24] -; CHECK-NEXT: vldr d17, [sp] -; CHECK-NEXT: vldr d16, [sp, #16] -; CHECK-NEXT: vst1.64 {d18, d19}, [r4]! -; CHECK-NEXT: vst1.64 {d16, d17}, [r4] -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #32 +; GNU-NEXT: mov r1, r3 +; GNU-NEXT: mov r12, r2 +; GNU-NEXT: add r2, sp, #24 +; GNU-NEXT: add r3, sp, #16 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r12 +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #40] +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: vldr d19, [sp, #8] +; GNU-NEXT: vldr d18, [sp, #24] +; GNU-NEXT: vldr d17, [sp] +; GNU-NEXT: vldr d16, [sp, #16] +; GNU-NEXT: vst1.64 {d18, d19}, [r4]! +; GNU-NEXT: vst1.64 {d16, d17}, [r4] +; GNU-NEXT: add sp, #32 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #32 +; GNUEABI-NEXT: sub sp, sp, #32 +; GNUEABI-NEXT: mov r1, r3 +; GNUEABI-NEXT: mov r12, r2 +; GNUEABI-NEXT: add r2, sp, #24 +; GNUEABI-NEXT: add r3, sp, #16 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r12 +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldr r0, [sp, #40] +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: ldr r1, [sp, #44] +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: vldr d19, [sp, #8] +; GNUEABI-NEXT: vldr d18, [sp, #24] +; GNUEABI-NEXT: vldr d17, [sp] +; GNUEABI-NEXT: vldr d16, [sp, #16] +; GNUEABI-NEXT: vst1.64 {d18, d19}, [r4]! +; GNUEABI-NEXT: vst1.64 {d16, d17}, [r4] +; GNUEABI-NEXT: add sp, sp, #32 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, r8, r10, r11, lr} +; IOS-NO-STRET-NEXT: vpush {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: ldr r8, [sp, #64] +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r3 +; IOS-NO-STRET-NEXT: mov r6, r3 +; IOS-NO-STRET-NEXT: mov r10, r2 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r11, r0 +; IOS-NO-STRET-NEXT: mov r5, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d9, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: vmov d11, r11, r5 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: vmov d10, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vst1.32 {d10, d11}, [r4]! 
+; IOS-NO-STRET-NEXT: vst1.32 {d8, d9}, [r4] +; IOS-NO-STRET-NEXT: vpop {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, r8, r10, r11, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, r6, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #32 +; IOS-WITH-STRET-NEXT: mov r4, r2 +; IOS-WITH-STRET-NEXT: ldr r2, [sp, #48] +; IOS-WITH-STRET-NEXT: mov r6, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: mov r5, r1 +; IOS-WITH-STRET-NEXT: mov r1, r3 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: mov r1, r5 +; IOS-WITH-STRET-NEXT: mov r2, r4 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d17, [sp, #16] +; IOS-WITH-STRET-NEXT: vldr d16, [sp] +; IOS-WITH-STRET-NEXT: vldr d19, [sp, #24] +; IOS-WITH-STRET-NEXT: vldr d18, [sp, #8] +; IOS-WITH-STRET-NEXT: vst1.32 {d16, d17}, [r6]! +; IOS-WITH-STRET-NEXT: vst1.32 {d18, d19}, [r6] +; IOS-WITH-STRET-NEXT: add sp, sp, #32 +; IOS-WITH-STRET-NEXT: pop {r4, r5, r6, pc} +; +; WATCHABI-LABEL: test_sincos_v2f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: .cfi_def_cfa_offset 56 +; WATCHABI-NEXT: .cfi_offset d13, -16 +; WATCHABI-NEXT: .cfi_offset d12, -24 +; WATCHABI-NEXT: .cfi_offset d11, -32 +; WATCHABI-NEXT: .cfi_offset d10, -40 +; WATCHABI-NEXT: .cfi_offset d9, -48 +; WATCHABI-NEXT: .cfi_offset d8, -56 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 64 +; WATCHABI-NEXT: vorr q4, q0, q0 +; WATCHABI-NEXT: vorr d0, d9, d9 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d11, d0, d0 +; WATCHABI-NEXT: vorr d0, d8, d8 +; WATCHABI-NEXT: vorr d13, d1, d1 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d12, d1, d1 +; WATCHABI-NEXT: vorr d10, d0, d0 +; WATCHABI-NEXT: vorr q1, q6, q6 +; WATCHABI-NEXT: vorr q0, q5, q5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } define { fp128, fp128 } @test_sincos_f128(fp128 %a) { -; CHECK-LABEL: test_sincos_f128: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: ldr r3, [sp, #56] -; CHECK-NEXT: add.w lr, sp, #8 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: strd r0, lr, [sp] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: bl sincosl -; CHECK-NEXT: ldrd r2, r3, [sp, #16] -; CHECK-NEXT: ldrd r12, r1, [sp, #8] -; CHECK-NEXT: str r3, [r4, #28] -; CHECK-NEXT: ldrd r3, r5, [sp, #32] -; CHECK-NEXT: ldrd lr, r0, [sp, #24] -; CHECK-NEXT: strd r1, r2, [r4, #20] -; CHECK-NEXT: add.w r1, r4, #8 -; CHECK-NEXT: stm.w r1, {r3, r5, r12} -; CHECK-NEXT: strd lr, r0, [r4] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; GNU-LABEL: test_sincos_f128: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, r5, r7, lr} +; GNU-NEXT: sub sp, #40 +; GNU-NEXT: mov r12, r3 +; GNU-NEXT: ldr r3, [sp, #56] +; GNU-NEXT: add.w lr, sp, #8 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: add r0, sp, #24 +; GNU-NEXT: strd r0, lr, [sp] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: mov r1, r2 +; 
GNU-NEXT: mov r2, r12 +; GNU-NEXT: bl sincosl +; GNU-NEXT: ldrd r2, r3, [sp, #16] +; GNU-NEXT: ldrd r12, r1, [sp, #8] +; GNU-NEXT: str r3, [r4, #28] +; GNU-NEXT: ldrd r3, r5, [sp, #32] +; GNU-NEXT: ldrd lr, r0, [sp, #24] +; GNU-NEXT: strd r1, r2, [r4, #20] +; GNU-NEXT: add.w r1, r4, #8 +; GNU-NEXT: stm.w r1, {r3, r5, r12} +; GNU-NEXT: strd lr, r0, [r4] +; GNU-NEXT: add sp, #40 +; GNU-NEXT: pop {r4, r5, r7, pc} +; +; GNUEABI-LABEL: test_sincos_f128: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, r5, r11, lr} +; GNUEABI-NEXT: push {r4, r5, r11, lr} +; GNUEABI-NEXT: .pad #40 +; GNUEABI-NEXT: sub sp, sp, #40 +; GNUEABI-NEXT: mov r12, r3 +; GNUEABI-NEXT: ldr r3, [sp, #56] +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: add r0, sp, #24 +; GNUEABI-NEXT: add r5, sp, #8 +; GNUEABI-NEXT: stm sp, {r0, r5} +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: mov r1, r2 +; GNUEABI-NEXT: mov r2, r12 +; GNUEABI-NEXT: bl sincosl +; GNUEABI-NEXT: add r3, sp, #12 +; GNUEABI-NEXT: ldr r12, [sp, #8] +; GNUEABI-NEXT: ldm r3, {r1, r2, r3} +; GNUEABI-NEXT: str r3, [r4, #28] +; GNUEABI-NEXT: ldr r0, [sp, #32] +; GNUEABI-NEXT: ldr lr, [sp, #24] +; GNUEABI-NEXT: ldr r5, [sp, #28] +; GNUEABI-NEXT: ldr r3, [sp, #36] +; GNUEABI-NEXT: str r2, [r4, #24] +; GNUEABI-NEXT: str r1, [r4, #20] +; GNUEABI-NEXT: add r1, r4, #8 +; GNUEABI-NEXT: stm r1, {r0, r3, r12} +; GNUEABI-NEXT: str r5, [r4, #4] +; GNUEABI-NEXT: str lr, [r4] +; GNUEABI-NEXT: add sp, sp, #40 +; GNUEABI-NEXT: pop {r4, r5, r11, pc} +; +; IOS-LABEL: test_sincos_f128: +; IOS: @ %bb.0: +; IOS-NEXT: push {r4, r5, r6, r7, r8, lr} +; IOS-NEXT: ldr r8, [sp, #24] +; IOS-NEXT: mov r4, r0 +; IOS-NEXT: mov r5, r3 +; IOS-NEXT: mov r6, r2 +; IOS-NEXT: mov r7, r1 +; IOS-NEXT: mov r0, r1 +; IOS-NEXT: mov r1, r2 +; IOS-NEXT: mov r2, r3 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _cosl +; IOS-NEXT: add r9, r4, #16 +; IOS-NEXT: stm r9, {r0, r1, r2, r3} +; IOS-NEXT: mov r0, r7 +; IOS-NEXT: mov r1, r6 +; IOS-NEXT: mov r2, r5 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _sinl +; IOS-NEXT: stm r4, {r0, r1, r2, r3} +; IOS-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; WATCHABI-LABEL: test_sincos_f128: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: .cfi_offset r6, -12 +; WATCHABI-NEXT: .cfi_offset r5, -16 +; WATCHABI-NEXT: .cfi_offset r4, -20 +; WATCHABI-NEXT: .cfi_offset r8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: ldr.w r8, [sp, #32] +; WATCHABI-NEXT: mov r4, r0 +; WATCHABI-NEXT: mov r5, r3 +; WATCHABI-NEXT: mov r6, r2 +; WATCHABI-NEXT: mov r7, r1 +; WATCHABI-NEXT: mov r0, r1 +; WATCHABI-NEXT: mov r1, r2 +; WATCHABI-NEXT: mov r2, r3 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _cosl +; WATCHABI-NEXT: add.w r9, r4, #16 +; WATCHABI-NEXT: stm.w r9, {r0, r1, r2, r3} +; WATCHABI-NEXT: mov r0, r7 +; WATCHABI-NEXT: mov r1, r6 +; WATCHABI-NEXT: mov r2, r5 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _sinl +; WATCHABI-NEXT: stm r4!, {r0, r1, r2, r3} +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { fp128, fp128 } @llvm.sincos.f16(fp128 %a) ret { fp128, fp128 } %result } diff --git a/llvm/test/CodeGen/BPF/bpf_trap.ll b/llvm/test/CodeGen/BPF/bpf_trap.ll new file mode 100644 index 0000000000000..ab8df5ff7cb0d --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf_trap.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s 
| FileCheck %s +; +target triple = "bpf" + +define i32 @test(i8 %x) { +entry: + %0 = and i8 %x, 3 + switch i8 %0, label %default.unreachable4 [ + i8 0, label %return + i8 1, label %sw.bb1 + i8 2, label %sw.bb2 + i8 3, label %sw.bb3 + ] + +sw.bb1: ; preds = %entry + br label %return + +sw.bb2: ; preds = %entry + br label %return + +sw.bb3: ; preds = %entry + br label %return + +default.unreachable4: ; preds = %entry + unreachable + +return: ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1 + %retval.0 = phi i32 [ 12, %sw.bb1 ], [ 43, %sw.bb2 ], [ 54, %sw.bb3 ], [ 32, %entry ] + ret i32 %retval.0 +} + +; CHECK-NOT: __bpf_trap diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll index d5a1d63b644a8..b7d518639d70e 100644 --- a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll +++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll @@ -84,8 +84,8 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 8 ; CHECK: BPF.JT.0.1: -; CHECK: .quad LBB0_4 +; CHECK: .quad LBB0_4-.text ; CHECK: .size BPF.JT.0.1, 8 diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll index bbca46850843b..71c682f5530ed 100644 --- a/llvm/test/CodeGen/BPF/jump_table_global_var.ll +++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll @@ -78,6 +78,6 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_1 -; CHECK: .quad LBB0_2 +; CHECK: .quad LBB0_1-.text +; CHECK: .quad LBB0_2-.text ; CHECK: .size BPF.JT.0.0, 16 diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll index 682b025d665d6..eb1e5bff11013 100644 --- a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll +++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll @@ -93,34 +93,34 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_4 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_2 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_4-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_2-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; 
CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_3-.text
 ; CHECK: .size BPF.JT.0.0, 240
diff --git a/llvm/test/CodeGen/DirectX/f16tof32.ll b/llvm/test/CodeGen/DirectX/f16tof32.ll
new file mode 100644
index 0000000000000..edc5c1942e8bd
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/f16tof32.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.9-library %s | FileCheck %s
+
+define hidden noundef nofpclass(nan inf) float @_Z11test_scalarj(i32 noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 %p0)
+  ; CHECK: ret float [[UINT]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %p0)
+  ret float %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <2 x float> @_Z10test_uint2Dv2_j(<2 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT2_0:%.*]] = extractelement <2 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_0]])
+  ; CHECK: [[UINT2_1:%.*]] = extractelement <2 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_1]])
+  ; CHECK: [[FLOAT2_0:%.*]] = insertelement <2 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT2_1:%.*]] = insertelement <2 x float> [[FLOAT2_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: ret <2 x float> [[FLOAT2_1]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %p0)
+  ret <2 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT3_0:%.*]] = extractelement <3 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_0]])
+  ; CHECK: [[UINT3_1:%.*]] = extractelement <3 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_1]])
+  ; CHECK: [[UINT3_2:%.*]] = extractelement <3 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_2]])
+  ; CHECK: [[FLOAT3_0:%.*]] = insertelement <3 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT3_1:%.*]] = insertelement <3 x float> [[FLOAT3_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT3_2:%.*]] = insertelement <3 x float> [[FLOAT3_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: ret <3 x float> [[FLOAT3_2]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %p0)
+  ret <3 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT4_0:%.*]] = extractelement <4 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_0]])
+  ; CHECK: [[UINT4_1:%.*]] = extractelement <4 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_1]])
+  ; CHECK: [[UINT4_2:%.*]] = extractelement <4 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_2]])
+  ; CHECK: [[UINT4_3:%.*]] = extractelement <4 x i32> %p0, i64 3
+  ; CHECK: [[FLOAT_3:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_3]])
+  ; CHECK: 
[[FLOAT4_0:%.*]] = insertelement <4 x float> poison, float [[FLOAT_0]], i64 0 + ; CHECK: [[FLOAT4_1:%.*]] = insertelement <4 x float> [[FLOAT4_0]], float [[FLOAT_1]], i64 1 + ; CHECK: [[FLOAT4_2:%.*]] = insertelement <4 x float> [[FLOAT4_1]], float [[FLOAT_2]], i64 2 + ; CHECK: [[FLOAT4_3:%.*]] = insertelement <4 x float> [[FLOAT4_2]], float [[FLOAT_3]], i64 3 + ; CHECK : ret <4 x float> [[FLOAT4_3]] + %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %p0) + ret <4 x float> %hlsl.f16tof32 +} diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll new file mode 100644 index 0000000000000..67d2ad72ee2f4 --- /dev/null +++ b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll @@ -0,0 +1,50 @@ +; REQUIRES: x86-registered-target + +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -O0 -mfs-psi-cutoff=0 -mfs-count-threshold=10000 | FileCheck %s + +;; Check that functions with optnone attribute are not split. +; CHECK-LABEL: foo_optnone: +; CHECK-NOT: .section .text.split.foo_optnone +; CHECK-NOT: foo_optnone.cold: +; CHECK: .LBB0_2: +; CHECK: .size foo_optnone + +define void @foo_optnone(i1 zeroext %0) nounwind optnone noinline !prof !14 !section_prefix !15 { +entry: + br i1 %0, label %hot, label %cold, !prof !17 + +hot: + %1 = call i32 @bar() + br label %exit + +cold: + %2 = call i32 @baz() + br label %exit + +exit: + %3 = tail call i32 @qux() + ret void +} + +declare i32 @bar() +declare i32 @baz() +declare i32 @qux() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 5} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999900, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 7000} +!15 = !{!"function_section_prefix", !"hot"} +!17 = !{!"branch_weights", i32 7000, i32 0} diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index 559bb68741e12..930cf8152b756 100644 --- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -6,11 +6,11 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" -define i32 @fred(ptr %a0) #0 { +define i32 @fred(ptr %a0, i32 %cond) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) jump:nt .LBB0_2 +; CHECK-NEXT: p0 = cmp.eq(r1,#5); if (!p0.new) jump:t .LBB0_2 ; CHECK-NEXT: } ; CHECK-NEXT: // %bb.1: // %b2 ; CHECK-NEXT: { @@ -40,7 +40,7 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } b0: - switch i32 undef, label %b14 [ + switch i32 %cond, label %b14 [ i32 5, label %b2 i32 3, label %b1 ] diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..006713ccabf47 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define void @lasx_cast_128_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define void @lasx_cast_128_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define void @lasx_cast_128(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x 
float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + 
+declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 4d930cd9e57c0..3626613cf8511 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} @@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; 
SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsub( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsub( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_faddx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_faddx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; 
SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsubx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsubx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fmulx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; +; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fmulx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fdiv( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<5>; +; SM90-FTZ-NEXT: .reg .b32 %r<8>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; +; 
SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fdiv( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<5>; @@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fpext_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fpext_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptrunc_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptrunc_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd_imm_1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd_imm_1( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: 
test_extload_bf16x8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<9>; +; SM90-FTZ-NEXT: .reg .b32 %r<13>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; +; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_extload_bf16x8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<9>; @@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptosi_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptosi_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptoui_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptoui_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_sitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_sitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: st.param.b16 
[func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .pred %p<2>; +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; +; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1; +; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0; +; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i1( ; SM90: { ; SM90-NEXT: .reg .pred %p<2>; @@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i32( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i32( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i64( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i64( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_roundeven( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; +; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 
[func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_roundeven( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1372,6 +1605,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1430,6 +1674,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; +; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1511,6 +1766,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -1583,6 +1849,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1]; +; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll new file mode 100644 index 0000000000000..4d81fdc67736d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -0,0 +1,11 @@ +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s + +; Test that we get a clear error message when using an unsupported syncscope. 
+ +; CHECK: NVPTX backend does not support syncscope "agent" +; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device +define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) { + %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic + %value = extractvalue { i32, i1 } %result, 0 + ret i32 %value +} diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll index 7e2f744ac1d71..94121f09e36be 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c) local_unnamed_addr #0 { ; CHECK-LABEL: testMultiply: @@ -91,6 +97,91 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound ; CHECK-BE-NEXT: ld r30, -16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: testMultiply: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r30, -16(r1) +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-LE-WACC-NEXT: subfic r0, r0, -128 +; CHECK-LE-WACC-NEXT: mr r30, r1 +; CHECK-LE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-LE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-LE-WACC-NEXT: addi r3, r1, 32 +; CHECK-LE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v2, v31 +; CHECK-LE-WACC-NEXT: vmr v3, v30 +; CHECK-LE-WACC-NEXT: mr r29, r5 +; CHECK-LE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_@notoc +; CHECK-LE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-LE-WACC-NEXT: lxv vs0, 48(r1) +; CHECK-LE-WACC-NEXT: lxv vs1, 32(r1) +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, vs1, vs0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v5, 0(r29) +; CHECK-LE-WACC-NEXT: pstxv v4, 8(r29), 0 +; CHECK-LE-WACC-NEXT: stxv v3, 16(r29) +; CHECK-LE-WACC-NEXT: pstxv v2, 24(r29), 0 +; CHECK-LE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: mr r1, r30 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: ld r30, -16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testMultiply: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r30, -16(r1) +; CHECK-BE-WACC-NEXT: std 
r0, 16(r1) +; CHECK-BE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-BE-WACC-NEXT: subfic r0, r0, -224 +; CHECK-BE-WACC-NEXT: mr r30, r1 +; CHECK-BE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-BE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-BE-WACC-NEXT: addi r3, r1, 128 +; CHECK-BE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v2, v31 +; CHECK-BE-WACC-NEXT: vmr v3, v30 +; CHECK-BE-WACC-NEXT: mr r29, r5 +; CHECK-BE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_ +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-BE-WACC-NEXT: lxv vs0, 128(r1) +; CHECK-BE-WACC-NEXT: lxv vs1, 144(r1) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: vmr v7, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v3 +; CHECK-BE-WACC-NEXT: vmr v6, v5 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r29) +; CHECK-BE-WACC-NEXT: pstxv v3, 8(r29), 0 +; CHECK-BE-WACC-NEXT: stxv v4, 16(r29) +; CHECK-BE-WACC-NEXT: pstxv v5, 24(r29), 0 +; CHECK-BE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: mr r1, r30 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: ld r30, -16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %vP = alloca <256 x i1>, align 32 call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %vP) diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll index 059d60a9608f8..bc5d5bed36e9b 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll @@ -3,10 +3,18 @@ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ ; RUN: --check-prefix=LE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ +; RUN: --check-prefix=LE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ ; RUN: FileCheck %s --check-prefix=BE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ +; RUN: FileCheck %s --check-prefix=BE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ ; RUN: | FileCheck %s --check-prefix=LE-PWR9 @@ -36,6 +44,20 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+128(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+96(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+80(0), 1 +; 
LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+112(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+176(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+160(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+144(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+128(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -50,6 +72,22 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 160(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: lxv v3, 112(r3) +; BE-PAIRED-WACC-NEXT: lxv v5, 80(r3) +; BE-PAIRED-WACC-NEXT: lxv v2, 96(r3) +; BE-PAIRED-WACC-NEXT: lxv v4, 64(r3) +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxv v5, 176(r3) +; BE-PAIRED-WACC-NEXT: stxv v4, 160(r3) +; BE-PAIRED-WACC-NEXT: stxv v3, 144(r3) +; BE-PAIRED-WACC-NEXT: stxv v2, 128(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -147,6 +185,25 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs2, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: paddi r5, 0, f@PCREL, 1 +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv v2, 16(r6) +; LE-PAIRED-WACC-NEXT: lxv v5, 32(r6) +; LE-PAIRED-WACC-NEXT: lxv v4, 48(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: stxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv v4, 48(r4) +; LE-PAIRED-WACC-NEXT: stxv v5, 32(r4) +; LE-PAIRED-WACC-NEXT: stxv v2, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, f@toc@ha @@ -165,6 +222,26 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 32(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r5, r5, f@toc@l +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv v5, 48(r6) +; BE-PAIRED-WACC-NEXT: lxv v3, 16(r6) +; BE-PAIRED-WACC-NEXT: lxv v4, 32(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv v5, 48(r4) +; BE-PAIRED-WACC-NEXT: stxv v4, 32(r4) +; BE-PAIRED-WACC-NEXT: stxv v3, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, f@toc@ha @@ -263,6 +340,20 @@ define dso_local void @testUnalignedLdSt() { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; 
LE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+43(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+59(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+67(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+51(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -277,6 +368,22 @@ define dso_local void @testUnalignedLdSt() { ; BE-PAIRED-NEXT: pstxv vs2, 51(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: plxv v3, 59(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v5, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v2, 43(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v4, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: pstxv v5, 67(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v4, 51(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v3, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v2, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -381,6 +488,14 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+64(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+48(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+32(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -391,6 +506,16 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs0, 64(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: lxv vs0, 48(r3) +; BE-PAIRED-WACC-NEXT: lxv vs1, 32(r3) +; BE-PAIRED-WACC-NEXT: stxv vs0, 80(r3) +; BE-PAIRED-WACC-NEXT: stxv vs1, 64(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha @@ -460,6 +585,19 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs1, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; LE-PAIRED-WACC-NEXT: paddi r5, 0, g@PCREL, 1 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, 
g@toc@ha @@ -474,6 +612,20 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs1, 16(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; BE-PAIRED-WACC-NEXT: addi r5, r5, g@toc@l +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, g@toc@ha @@ -548,6 +700,14 @@ define dso_local void @testUnalignedLdStPair() { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -558,6 +718,16 @@ define dso_local void @testUnalignedLdStPair() { ; BE-PAIRED-NEXT: pstxv vs0, 19(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: plxv vs0, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv vs1, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs0, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs1, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll index abc65bed5bf6c..9db8ba1c9eb09 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll @@ -13,6 +13,13 @@ ; RUN: -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC + declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare void @foo() @@ -119,6 +126,101 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: intrinsics1: +; CHECK-LE-WACC: # %bb.0: +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: stdu r1, -176(r1) +; CHECK-LE-WACC-NEXT: .cfi_def_cfa_offset 176 +; CHECK-LE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-LE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-LE-WACC-NEXT: 
.cfi_offset v28, -80 +; CHECK-LE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-LE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-LE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-LE-WACC-NEXT: stxv v28, 96(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v29, 112(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v30, 128(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, 144(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v31, v5 +; CHECK-LE-WACC-NEXT: vmr v29, v3 +; CHECK-LE-WACC-NEXT: vmr v30, v4 +; CHECK-LE-WACC-NEXT: vmr v28, v2 +; CHECK-LE-WACC-NEXT: std r30, 160(r1) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: ld r30, 272(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-LE-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-LE-WACC-NEXT: bl foo@notoc +; CHECK-LE-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-LE-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r30) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r30) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r30) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r30) +; CHECK-LE-WACC-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v28, 96(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r30, 160(r1) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: addi r1, r1, 176 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -256(r1) +; CHECK-BE-WACC-NEXT: .cfi_def_cfa_offset 256 +; CHECK-BE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-BE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-BE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-BE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-BE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-BE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-BE-WACC-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v31, v5 +; CHECK-BE-WACC-NEXT: vmr v29, v3 +; CHECK-BE-WACC-NEXT: vmr v30, v4 +; CHECK-BE-WACC-NEXT: vmr v28, v2 +; CHECK-BE-WACC-NEXT: std r30, 240(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: ld r30, 368(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl foo +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r30) +; CHECK-BE-WACC-NEXT: lxv 
v31, 224(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, 208(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v29, 192(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v28, 176(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r30, 240(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 256 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3) tail call void @foo() diff --git a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll index e932aec2c7134..7b36fa4f64f71 100644 --- a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; Function Attrs: nofree nounwind writeonly define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone %vpp, <16 x i8> %vc, ptr nocapture %resp) { @@ -27,6 +33,26 @@ define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test1: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -57,6 +83,26 @@ define dso_local void @test2(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test2: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; 
CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -97,6 +143,36 @@ define dso_local void @test3(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test3: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -138,6 +214,36 @@ define dso_local void @test4(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test4: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -179,6 +285,36 @@ define dso_local void @test5(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test5: +; CHECK-LE-WACC: # %bb.0: # 
%entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -220,6 +356,36 @@ define dso_local void @test6(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test6: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll index 8fbc9d785796d..3505cbb197bf9 100644 --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future 
-ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; assemble_acc declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) @@ -32,6 +38,28 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: ass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: ass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %ptr, align 64 @@ -66,6 +94,28 @@ define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmtacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmtacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmtacc is ; generated from the call to xxmtacc then one xxmfacc is generated for the store @@ -101,6 +151,28 @@ define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmfacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmfacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmfacc is ; generated from the call to xxmfacc then one xxmfacc is generated for 
the store @@ -132,6 +204,26 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxsetaccz: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxsetaccz: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %ptr, align 64 @@ -160,6 +252,26 @@ define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: disass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: disass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0) @@ -219,6 +331,50 @@ define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testBranch: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r7, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: b .LBB5_3 +; CHECK-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testBranch: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r7, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: b .LBB5_3 +; CHECK-BE-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: .LBB5_3: # 
%if.end +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %tobool = icmp eq i32 %val, 0 br i1 %tobool, label %if.else, label %if.then @@ -273,6 +429,36 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -320,6 +506,42 @@ define void @testcse2(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -367,6 +589,42 @@ define void @testcse3(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: 
dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -475,6 +733,104 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) { ; CHECK-BE-NEXT: bdnz .LBB9_2 ; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bltlr cr0 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-WACC-NEXT: mtctr r4 +; CHECK-WACC-NEXT: li r4, 0 +; CHECK-WACC-NEXT: li r6, 0 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-WACC-NEXT: add r8, r5, r7 +; CHECK-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-WACC-NEXT: addi r4, r4, 3 +; CHECK-WACC-NEXT: addi r6, r6, 6 +; CHECK-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-WACC-NEXT: add r8, r3, r7 +; CHECK-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-WACC-NEXT: stxvx v3, r3, r7 +; CHECK-WACC-NEXT: stxv v4, 48(r8) +; CHECK-WACC-NEXT: stxv v5, 32(r8) +; CHECK-WACC-NEXT: stxv v2, 16(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r8) +; CHECK-WACC-NEXT: stxv v5, 96(r8) +; CHECK-WACC-NEXT: stxv v2, 80(r8) +; CHECK-WACC-NEXT: stxv v3, 64(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 176(r8) +; CHECK-WACC-NEXT: stxv v5, 160(r8) +; CHECK-WACC-NEXT: stxv v2, 144(r8) +; CHECK-WACC-NEXT: stxv v3, 128(r8) +; CHECK-WACC-NEXT: bdnz .LBB9_2 +; CHECK-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bltlr cr0 
+; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-BE-WACC-NEXT: mtctr r4 +; CHECK-BE-WACC-NEXT: li r4, 0 +; CHECK-BE-WACC-NEXT: li r6, 0 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-BE-WACC-NEXT: add r8, r5, r7 +; CHECK-BE-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-BE-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-BE-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-BE-WACC-NEXT: addi r4, r4, 3 +; CHECK-BE-WACC-NEXT: addi r6, r6, 6 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-BE-WACC-NEXT: add r8, r3, r7 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-BE-WACC-NEXT: stxvx v2, r3, r7 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 176(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 160(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 144(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 128(r8) +; CHECK-BE-WACC-NEXT: bdnz .LBB9_2 +; CHECK-BE-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: blr entry: %cmp55 = icmp sgt i32 %lim, 0 br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup @@ -600,6 +956,71 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind { ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: mflr r0 +; CHECK-WACC-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-WACC-NEXT: std r0, 16(r1) +; CHECK-WACC-NEXT: stdu r1, -112(r1) +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-WACC-NEXT: stxv v0, 48(r3) +; CHECK-WACC-NEXT: stxv v1, 32(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-WACC-NEXT: mr r30, r3 +; CHECK-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-WACC-NEXT: bl testRedundantPrimeUnprimeF@notoc +; CHECK-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r30) +; CHECK-WACC-NEXT: stxv v5, 96(r30) +; CHECK-WACC-NEXT: stxv v2, 80(r30) +; CHECK-WACC-NEXT: stxv v3, 64(r30) +; CHECK-WACC-NEXT: addi r1, r1, 112 +; CHECK-WACC-NEXT: ld r0, 16(r1) +; CHECK-WACC-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-WACC-NEXT: mtlr r0 +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu 
r1, -192(r1) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: std r30, 176(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v1, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v0, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r3) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: mr r30, r3 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl testRedundantPrimeUnprimeF +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r30) +; CHECK-BE-WACC-NEXT: ld r30, 176(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 192 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %dst, align 64 @@ -646,6 +1067,38 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = getelementptr i8, ptr %vpp, i64 8 @@ -688,6 +1141,38 @@ define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 
0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) @@ -729,6 +1214,38 @@ define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x ; CHECK-BE-NEXT: stxv vs3, 48(r9) ; CHECK-BE-NEXT: stxv vs2, 32(r9) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r9) +; CHECK-WACC-NEXT: stxv v5, 32(r9) +; CHECK-WACC-NEXT: stxv v2, 16(r9) +; CHECK-WACC-NEXT: stxv v3, 0(r9) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r9) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r9) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r9) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r9) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll index ac6ad41633492..ff860b8d6ff22 100644 --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -enable-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) @@ -56,6 +62,46 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: 
intrinsics1: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: vmr v1, v4 +; CHECK-WACC-NEXT: vmr v4, v3 +; CHECK-WACC-NEXT: vmr v0, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-WACC-NEXT: ld r3, 96(r1) +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: vmr v2, v5 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: vmr v1, v4 +; CHECK-BE-WACC-NEXT: vmr v4, v3 +; CHECK-BE-WACC-NEXT: vmr v0, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: ld r3, 112(r1) +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: vmr v2, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc3, <16 x i8> %vc2, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) @@ -115,6 +161,46 @@ define void @intrinsics2(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4, ptr %ptr) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics2: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: lxv v2, 0(r3) +; CHECK-WACC-NEXT: lxv v4, 0(r5) +; CHECK-WACC-NEXT: lxv v3, 0(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r6) +; CHECK-WACC-NEXT: vmr v1, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-WACC-NEXT: vmr v0, v5 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics2: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: lxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: lxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr %vc1 = load <16 
x i8>, ptr %ptr1, align 16 %vc2 = load <16 x i8>, ptr %ptr2, align 16 %vc3 = load <16 x i8>, ptr %ptr3, align 16 @@ -157,6 +243,26 @@ define void @test1(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -196,6 +302,36 @@ define void @test2(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -226,6 +362,26 @@ define void @test3(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, 
ptr %resp, align 64 @@ -265,6 +421,36 @@ define void @test4(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -295,6 +481,26 @@ define void @test5(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test5: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -334,6 +540,36 @@ define void @test6(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test6: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -364,6 +600,26 @@ define void @test7(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test7: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test7: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -403,6 +659,36 @@ define void @test8(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test8: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test8: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -433,6 +719,26 @@ define void @test9(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test9: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test9: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, 
wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -472,6 +778,36 @@ define void @test10(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test10: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test10: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -502,6 +838,26 @@ define void @test11(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test11: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test11: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -541,6 +897,36 @@ define void @test12(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test12: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: 
test12: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -571,6 +957,26 @@ define void @test13(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test13: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test13: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -610,6 +1016,36 @@ define void @test14(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test14: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test14: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -650,6 +1086,36 @@ define void @test15(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test15: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; 
CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test15: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -690,6 +1156,36 @@ define void @test16(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test16: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test16: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -730,6 +1226,36 @@ define void @test17(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test17: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test17: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -760,6 +1286,26 @@ define void @test18(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test18: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test18: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -799,6 +1345,36 @@ define void @test19(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test19: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test19: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -839,6 +1415,36 @@ define void @test20(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test20: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) 
+; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test20: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -879,6 +1485,36 @@ define void @test21(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test21: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test21: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -919,6 +1555,36 @@ define void @test22(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test22: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test22: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; 
CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -949,6 +1615,26 @@ define void @test23(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test23: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test23: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -988,6 +1674,36 @@ define void @test24(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test24: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test24: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1028,6 +1744,36 @@ define void @test25(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test25: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test25: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) 
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1068,6 +1814,36 @@ define void @test26(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test26: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test26: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1108,6 +1884,36 @@ define void @test27(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test27: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test27: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1138,6 +1944,26 @@ define void @test28(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; 
CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test28: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test28: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -1177,6 +2003,36 @@ define void @test29(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test29: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test29: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1217,6 +2073,36 @@ define void @test30(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test30: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test30: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: 
stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1257,6 +2143,36 @@ define void @test31(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test31: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test31: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1297,6 +2213,36 @@ define void @test32(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test32: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test32: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1331,6 +2277,30 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test33: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test33: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %0, <16 x i8> %vc) @@ -1375,6 +2345,40 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test34: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test34: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1420,6 +2424,40 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test35: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test35: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; 
CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1465,6 +2503,40 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test36: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test36: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1510,6 +2582,40 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test37: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test37: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1545,6 +2651,30 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test38: +; CHECK-WACC: # %bb.0: 
# %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test38: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %0, <16 x i8> %vc, i32 0, i32 0) @@ -1589,6 +2719,40 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test39: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test39: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1634,6 +2798,40 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test40: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test40: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, 
vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1679,6 +2877,40 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test41: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test41: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1724,6 +2956,40 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test42: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test42: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 
32 diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll index 89e5147aecc5f..37d0e69b3beaa 100644 --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -5,6 +5,12 @@ ; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -64,6 +70,60 @@ define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r5, 3 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -2 +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 32 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB0_2 +; CHECK-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r5, 3 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -2 +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 32 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB0_2 +; CHECK-BE-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -161,6 +221,62 @@ define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; 
CHECK-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-WACC-NEXT: cmpwi r5, 4 +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -3 +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 48 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB1_2 +; CHECK-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-BE-WACC-NEXT: cmpwi r5, 4 +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -3 +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 48 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB1_2 +; CHECK-BE-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -229,6 +345,28 @@ define void @testImplicitDef(ptr %ptr) { ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testImplicitDef: +; CHECK-WACC: # %bb.0: # %label1 +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-WACC-NEXT: # %bb.1: # %label2 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v2, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testImplicitDef: +; CHECK-BE-WACC: # %bb.0: # %label1 +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %label2 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 0(r3) +; CHECK-BE-WACC-NEXT: blr label1: br i1 undef, label %label3, label %label2 @@ -312,6 +450,70 @@ define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %coun ; CHECK-BE-NEXT: stxv vs3, 48(r5) ; CHECK-BE-NEXT: stxv vs2, 32(r5) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testNestedPHI: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r3, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; 
CHECK-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-WACC-NEXT: b .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_2: +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-WACC-NEXT: addi r3, r4, -1 +; CHECK-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-WACC-NEXT: addi r3, r3, 1 +; CHECK-WACC-NEXT: mtctr r3 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: bdnz .LBB3_4 +; CHECK-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: li r3, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r5) +; CHECK-WACC-NEXT: stxv v5, 32(r5) +; CHECK-WACC-NEXT: stxv v2, 16(r5) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testNestedPHI: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r3, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-BE-WACC-NEXT: b .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_2: +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-BE-WACC-NEXT: addi r3, r4, -1 +; CHECK-BE-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-BE-WACC-NEXT: addi r3, r3, 1 +; CHECK-BE-WACC-NEXT: mtctr r3 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: bdnz .LBB3_4 +; CHECK-BE-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: li r3, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r5) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r5) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r5) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r5) +; CHECK-BE-WACC-NEXT: blr entry: %tobool.not = icmp eq i32 %cond, 0 br i1 %tobool.not, label %if.end, label %if.then diff --git a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll index 291cf97fd009e..929bf5f61dd90 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll +++ b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mcpu=ppc -mtriple=powerpc64-ibm-aix < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=future \ +; RUN: -mtriple=powerpc64-ibm-aix < %s | FileCheck %s --check-prefix=CHECK-WACC target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512" @@ -38,6 +40,43 @@ define void @baz(i64 %arg) local_unnamed_addr #0 { ; CHECK-NEXT: xxswapd 0, 0 ; CHECK-NEXT: stxv 0, 0(3) ; CHECK-NEXT: blr +; +; CHECK-WACC-LABEL: baz: +; CHECK-WACC: # %bb.0: # %bb +; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0 +; CHECK-WACC-NEXT: xxmrgld 1, 34, 36 +; CHECK-WACC-NEXT: xxswapd 2, 1 +; CHECK-WACC-NEXT: xxlxor 0, 0, 0 +; CHECK-WACC-NEXT: xvnegdp 1, 1 +; CHECK-WACC-NEXT: xvnegdp 2, 2 +; CHECK-WACC-NEXT: xvsubdp 1, 1, 0 +; CHECK-WACC-NEXT: xvsubdp 2, 2, 37 +; CHECK-WACC-NEXT: xvmuldp 1, 1, 0 +; CHECK-WACC-NEXT: xvmuldp 2, 2, 0 +; CHECK-WACC-NEXT: xvmaddadp 1, 0, 0 +; CHECK-WACC-NEXT: xvmaddadp 2, 0, 0 +; CHECK-WACC-NEXT: stxv 
1, 0(3) +; CHECK-WACC-NEXT: stxv 2, 0(3) +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 20, L..BB0_2 +; CHECK-WACC-NEXT: # %bb.1: # %bb10 +; CHECK-WACC-NEXT: xvf64gerpp 0, 34, 0 +; CHECK-WACC-NEXT: L..BB0_2: # %bb12 +; CHECK-WACC-NEXT: cmpdi 3, 0 +; CHECK-WACC-NEXT: .align 4 +; CHECK-WACC-NEXT: L..BB0_3: # %bb13 +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: bc 4, 2, L..BB0_3 +; CHECK-WACC-NEXT: # %bb.4: # %bb14 +; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0 +; CHECK-WACC-NEXT: xxlxor 0, 0, 0 +; CHECK-WACC-NEXT: xvsubdp 1, 0, 35 +; CHECK-WACC-NEXT: xxlxor 2, 2, 2 +; CHECK-WACC-NEXT: xvmaddadp 2, 1, 2 +; CHECK-WACC-NEXT: xvadddp 0, 2, 0 +; CHECK-WACC-NEXT: xxswapd 0, 0 +; CHECK-WACC-NEXT: stxv 0, 0(3) +; CHECK-WACC-NEXT: blr bb: %call = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> poison) %extractvalue = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %call, 0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll new file mode 100644 index 0000000000000..5cb55f15c7c8c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> 
@llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 
1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + 
i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define 
<vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> 
@intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) 
nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> 
%2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64(ptr 
%0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 
x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll new file mode 100644 index 0000000000000..fafd45b7579e8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: 
# %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> 
@intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + 
<vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale 
x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> 
@llvm.riscv.vloxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32( + <vscale x 2 x double> poison, + ptr %0, + <vscale 
x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v 
v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei16.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei16.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; 
CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vloxei16.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x 
i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> 
@llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale 
x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: 
vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> 
@intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + 
<vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> 
@intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x 
float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare 
<vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t 
+; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: 
# %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> 
%0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e8, m1, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret 
+entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> 
@llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + 
%a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i64> 
@intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + 
iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( 
+ <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( 
+ <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> 
@llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), 
v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x double> 
@intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll new file mode 100644 index 0000000000000..916af2556c6a8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
+ +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + 
i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> 
@intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> 
@intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> 
@intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr 
%1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll new file mode 100644 index 0000000000000..8dd32a1d640dc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry 
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), 
v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 
+; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + 
%a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale 
x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 
1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> 
@intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + 
ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i32> 
%1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call 
<vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei16.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> 
poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei16.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei16.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> 
@llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> 
%1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> 
%2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> 
%1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = 
call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v 
v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale 
x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + 
<vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> 
%0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: 
vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen 
%2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + 
ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> 
@llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x 
i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> 
@llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) 
nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> 
@intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 
x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x 
i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; 
CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll new file mode 100644 index 0000000000000..4963d91a14988 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll @@ -0,0 +1,1293 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 
8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} 
+ +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v 
v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, 
<vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + 
+define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll new file mode 100644 index 0000000000000..7ea2e1734e5a2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll @@ -0,0 +1,4881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, 
iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, 
mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 
1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 
x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret 
void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + 
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, 
+ ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + 
+declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 
8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale 
x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr 
%1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + 
ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale 
x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, 
ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: 
vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void 
@intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + 
<vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + 
ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + 
+declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8(
+ <vscale x 4 x double> %0,
+ ptr %1,
+ <vscale x 4 x i8> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i8(
+ <vscale x 8 x double>,
+ ptr,
+ <vscale x 8 x i8>,
+ iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vsoxei8.v v8, (a0), v16
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsoxei.nxv8f64.nxv8i8(
+ <vscale x 8 x double> %0,
+ ptr %1,
+ <vscale x 8 x i8> %2,
+ iXLen %3)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8(
+ <vscale x 8 x double>,
+ ptr,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8(
+ <vscale x 8 x double> %0,
+ ptr %1,
+ <vscale x 8 x i8> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4)
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll
new file mode 100644
index 0000000000000..9bd272a368d20
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll
@@ -0,0 +1,1310 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -global-isel -verify-machineinstrs \
+; RUN: < %s | FileCheck %s
+
+; The intrinsics are not supported with RV32.
+ +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +define void @intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> splat (i1 true), + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 
4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void 
@intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, 
+ ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: 
# %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void 
@intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale 
x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare 
void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e64, m8, ta, ma
+; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64(
+ <vscale x 8 x double> %0,
+ ptr %1,
+ <vscale x 8 x i64> %2,
+ <vscale x 8 x i1> %3,
+ i64 %4)
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll
new file mode 100644
index 0000000000000..7cd15454d40b9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll
@@ -0,0 +1,4881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
+ <vscale x 1 x i8>,
+ ptr,
+ <vscale x 1 x i32>,
+ iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsuxei32.v v8, (a0), v9
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
+ <vscale x 1 x i8> %0,
+ ptr %1,
+ <vscale x 1 x i32> %2,
+ iXLen %3)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32(
+ <vscale x 1 x i8>,
+ ptr,
+ <vscale x 1 x i32>,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32(
+ <vscale x 1 x i8> %0,
+ ptr %1,
+ <vscale x 1 x i32> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i32(
+ <vscale x 2 x i8>,
+ ptr,
+ <vscale x 2 x i32>,
+ iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsuxei32.v v8, (a0), v9
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.nxv2i8.nxv2i32(
+ <vscale x 2 x i8> %0,
+ ptr %1,
+ <vscale x 2 x i32> %2,
+ iXLen %3)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32(
+ <vscale x 2 x i8>,
+ ptr,
+ <vscale x 2 x i32>,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32(
+ <vscale x 2 x i8> %0,
+ ptr %1,
+ <vscale x 2 x i32> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i32(
+ <vscale x 4 x i8>,
+ ptr,
+ <vscale x 4 x i32>,
+ iXLen);
+
+define void 
@intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x 
i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + 
ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 
4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr 
%1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale 
x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale 
x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, 
<vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + 
<vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + 
<vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: 
ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void 
@intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 
32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + 
+ ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, 
<vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; 
CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x 
i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x 
i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} 
+ +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 988d0490afeb6..cf44af608542c 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -137,6 +137,7 @@ ; CHECK-NEXT: shifted-zextw-fusion - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension. ; CHECK-NEXT: shlcofideleg - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode). ; CHECK-NEXT: short-forward-branch-i-minmax - Enable short forward branch optimization for min,max instructions in Zbb. +; CHECK-NEXT: short-forward-branch-i-mul - Enable short forward branch optimization for mul instruction. ; CHECK-NEXT: short-forward-branch-opt - Enable short forward branch optimization. ; CHECK-NEXT: shtvala - 'Shtvala' (htval provides all needed values). ; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp). diff --git a/llvm/test/CodeGen/RISCV/mask-variable-shift.ll b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll new file mode 100644 index 0000000000000..4e73cee30ef08 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV32 +; RUN: llc -mtriple=riscv64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV64 + +define i32 @mask_pair(i32 %x, i32 %y) { +; RV32-LABEL: mask_pair: +; RV32: # %bb.0: +; RV32-NEXT: srl a0, a0, a1 +; RV32-NEXT: sll a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair: +; RV64: # %bb.0: +; RV64-NEXT: srlw a0, a0, a1 +; RV64-NEXT: sllw a0, a0, a1 +; RV64-NEXT: ret + %shl = shl nsw i32 -1, %y + %and = and i32 %shl, %x + ret i32 %and +} + +define i64 @mask_pair_64(i64 %x, i64 %y) { +; RV32-LABEL: mask_pair_64: +; RV32: # %bb.0: +; RV32-NEXT: li a3, -1 +; RV32-NEXT: addi a4, a2, -32 +; RV32-NEXT: sll a3, a3, a2 +; RV32-NEXT: bltz a4, .LBB1_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: j .LBB1_3 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: not a2, a2 +; RV32-NEXT: lui a5, 524288 +; RV32-NEXT: addi a5, a5, -1 +; RV32-NEXT: srl a2, a5, a2 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: .LBB1_3: +; RV32-NEXT: srai a4, a4, 31 +; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair_64: +; RV64: # %bb.0: +; RV64-NEXT: srl a0, a0, a1 +; RV64-NEXT: sll a0, a0, a1 +; RV64-NEXT: ret + %shl = shl nsw i64 -1, %y + %and = and i64 %shl, %x + ret i64 %and +} + +define i128 @mask_pair_128(i128 %x, i128 %y) { +; RV32-LABEL: mask_pair_128: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lw a5, 0(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a3, 8(a1) +; RV32-NEXT: lw a1, 
12(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: li a6, -1 +; RV32-NEXT: sw zero, 0(sp) +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: sw zero, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: addi a7, sp, 16 +; RV32-NEXT: sw a6, 16(sp) +; RV32-NEXT: sw a6, 20(sp) +; RV32-NEXT: sw a6, 24(sp) +; RV32-NEXT: sw a6, 28(sp) +; RV32-NEXT: srli a6, a2, 3 +; RV32-NEXT: andi a6, a6, 12 +; RV32-NEXT: sub a6, a7, a6 +; RV32-NEXT: lw a7, 4(a6) +; RV32-NEXT: lw t0, 8(a6) +; RV32-NEXT: lw t1, 12(a6) +; RV32-NEXT: lw a6, 0(a6) +; RV32-NEXT: andi t2, a2, 31 +; RV32-NEXT: xori t2, t2, 31 +; RV32-NEXT: sll t1, t1, a2 +; RV32-NEXT: srli t3, t0, 1 +; RV32-NEXT: sll t0, t0, a2 +; RV32-NEXT: srli t4, a7, 1 +; RV32-NEXT: sll a7, a7, a2 +; RV32-NEXT: sll a2, a6, a2 +; RV32-NEXT: srli a6, a6, 1 +; RV32-NEXT: srl t3, t3, t2 +; RV32-NEXT: srl t4, t4, t2 +; RV32-NEXT: srl a6, a6, t2 +; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: or a5, t1, t3 +; RV32-NEXT: or t0, t0, t4 +; RV32-NEXT: or a6, a7, a6 +; RV32-NEXT: and a4, a6, a4 +; RV32-NEXT: and a3, t0, a3 +; RV32-NEXT: and a1, a5, a1 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a4, 4(a0) +; RV32-NEXT: sw a3, 8(a0) +; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair_128: +; RV64: # %bb.0: +; RV64-NEXT: li a5, -1 +; RV64-NEXT: addi a4, a2, -64 +; RV64-NEXT: sll a3, a5, a2 +; RV64-NEXT: bltz a4, .LBB2_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: j .LBB2_3 +; RV64-NEXT: .LBB2_2: +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a5, 1 +; RV64-NEXT: srl a2, a5, a2 +; RV64-NEXT: or a2, a3, a2 +; RV64-NEXT: .LBB2_3: +; RV64-NEXT: srai a4, a4, 63 +; RV64-NEXT: and a3, a4, a3 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: and a0, a3, a0 +; RV64-NEXT: ret + %shl = shl nsw i128 -1, %y + %and = and i128 %shl, %x + ret i128 %and +} diff --git a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll index c489bc3681876..aa63552eb4b63 100644 --- a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll @@ -488,5 +488,5 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>) declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>) ;. ; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } ;. diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll new file mode 100644 index 0000000000000..bf0a2e5e35a01 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll @@ -0,0 +1,41 @@ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh < %s | FileCheck %s + +; CHECK-LABEL: .section .llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 0 +; Num Functions +; CHECK-NEXT: .word 1 +; Num LargeConstants +; CHECK-NEXT: .word 0 +; Num Callsites +; CHECK-NEXT: .word 1 + +; Functions and stack size +; CHECK-NEXT: .quad liveArgs +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .quad 1 + +; Spilled stack map values. +; +; Verify 3 stack map entries. 
+; +; CHECK-LABEL: .word .L{{.*}}-liveArgs +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 25 +; +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... +; CHECK: .byte 3 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +define void @liveArgs(double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) { +entry: + call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index c50a0fb3ffe91..320a3aa94cd7d 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -286,8 +286,8 @@ define void @liveConstant() { ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .half 28 ; -; Check that at least one is a spilled entry from RBP. -; Location: Indirect RBP + ... +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... 
; CHECK: .byte 3 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 @@ -307,7 +307,7 @@ entry: ; CHECK-NEXT: .half 0 ; 1 location ; CHECK-NEXT: .half 1 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 @@ -320,14 +320,14 @@ entry: ; CHECK-NEXT: .half 0 ; 2 locations ; CHECK-NEXT: .half 2 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 ; CHECK-NEXT: .half 2 ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .word -; Loc 1: Direct RBP - ofs +; Loc 1: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll new file mode 100644 index 0000000000000..3f780fddafcce --- /dev/null +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RV32I-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RV64I-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMul-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMul-M + +define i32 @select_example_mul_i32(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-M-LABEL: select_example_mul_i32: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a2, .LBB0_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a1, a0, a3 +; RV32I-M-NEXT: .LBB0_2: # %entry +; RV32I-M-NEXT: mv a0, a1 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i32: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB0_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mulw a1, a0, a3 +; RV64I-M-NEXT: .LBB0_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i32: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a0, a0, a3 +; RV32I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: mv a0, a1 +; RV32I-SFB-M-NEXT: .LBB0_2: # %entry +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i32: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mulw a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB0_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV32I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a1 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: mulw a0, a0, a3 +; RV64I-SFBIMul-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i32 %a, %y + %sel = 
select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_mul_i64(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-M-LABEL: select_example_mul_i64: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a4, .LBB1_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a2, a0, a6 +; RV32I-M-NEXT: mulhu a3, a0, a5 +; RV32I-M-NEXT: mul a1, a1, a5 +; RV32I-M-NEXT: add a2, a3, a2 +; RV32I-M-NEXT: add a3, a2, a1 +; RV32I-M-NEXT: mul a2, a0, a5 +; RV32I-M-NEXT: .LBB1_2: # %entry +; RV32I-M-NEXT: mv a0, a2 +; RV32I-M-NEXT: mv a1, a3 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i64: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB1_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mul a1, a0, a3 +; RV64I-M-NEXT: .LBB1_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i64: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a6, a0, a6 +; RV32I-SFB-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFB-M-NEXT: mul a1, a1, a5 +; RV32I-SFB-M-NEXT: mul a0, a0, a5 +; RV32I-SFB-M-NEXT: add a6, a7, a6 +; RV32I-SFB-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: add a3, a6, a1 +; RV32I-SFB-M-NEXT: .LBB1_2: # %entry +; RV32I-SFB-M-NEXT: bnez a4, .LBB1_4 +; RV32I-SFB-M-NEXT: # %bb.3: # %entry +; RV32I-SFB-M-NEXT: mv a0, a2 +; RV32I-SFB-M-NEXT: .LBB1_4: # %entry +; RV32I-SFB-M-NEXT: mv a1, a3 +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i64: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mul a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB1_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: mul a6, a0, a6 +; RV32I-SFBIMul-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFBIMul-M-NEXT: mul a1, a1, a5 +; RV32I-SFBIMul-M-NEXT: add a6, a7, a6 +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: add a3, a6, a1 +; RV32I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_4 +; RV32I-SFBIMul-M-NEXT: # %bb.3: # %entry +; RV32I-SFBIMul-M-NEXT: mul a2, a0, a5 +; RV32I-SFBIMul-M-NEXT: .LBB1_4: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a2 +; RV32I-SFBIMul-M-NEXT: mv a1, a3 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV64I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i64 %a, %y + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll index ec4884ff643cb..3e0d0cc4cd8e2 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll @@ -1,7 +1,9 @@ ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR +; RUN: llc --verify-machineinstrs --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-amd-amdhsa %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR ; RUN: llc 
--verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION -; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. +; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. ; DISABLED: %if spirv-tools %{ llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-MIR-DAG: [[i32type:%[0-9]+\:type]] = OpTypeInt 32, 0 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll index b1a555a52f40d..6b4e35e997124 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll @@ -7,6 +7,8 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_optnone,+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-TWO-EXTENSIONS ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS + ; CHECK-EXTENSION: OpCapability OptNoneEXT ; CHECK-EXTENSION: OpExtension "SPV_EXT_optnone" ; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll new file mode 100644 index 0000000000000..4cabddb94df25 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll @@ -0,0 +1,142 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Arithmetic instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[NEG:%.*]] "neg" +; CHECK-DAG: OpName [[NEGV:%.*]] "negv" +; CHECK-DAG: OpName [[ADD:%.*]] "add" +; CHECK-DAG: OpName [[ADDV:%.*]] "addv" +; CHECK-DAG: OpName [[SUB:%.*]] "sub" +; CHECK-DAG: OpName [[SUBV:%.*]] "subv" +; CHECK-DAG: OpName [[MUL:%.*]] "mul" +; CHECK-DAG: OpName [[MULV:%.*]] "mulv" +; CHECK-DAG: OpName [[DIV:%.*]] "div" +; CHECK-DAG: OpName [[DIVV:%.*]] "divv" +; CHECK-DAG: OpName [[REM:%.*]] "rem" +; CHECK-DAG: 
OpName [[REMV:%.*]] "remv" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 4 + +; CHECK-DAG: [[NEG]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOAT]] [[X]] +define spir_func bfloat @neg(bfloat %x) { +entry: + %r = fneg bfloat %x + ret bfloat %r +} + +; CHECK-DAG: [[NEGV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOATV]] [[X]] +define spir_func <4 x bfloat> @negv(<4 x bfloat> %x) { +entry: + %r = fneg <4 x bfloat> %x + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[ADD]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @add(bfloat %x, bfloat %y) { +entry: + %r = fadd bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[ADDV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @addv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fadd <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[SUB]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @sub(bfloat %x, bfloat %y) { +entry: + %r = fsub bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[SUBV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @subv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fsub <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[MUL]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @mul(bfloat %x, bfloat %y) { +entry: + %r = fmul bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[MULV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @mulv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fmul <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[DIV]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @div(bfloat %x, bfloat %y) { +entry: + %r = fdiv bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[DIVV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @divv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fdiv <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[REM]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOAT]] [[X]] [[Y]] +define spir_func 
bfloat @rem(bfloat %x, bfloat %y) { +entry: + %r = frem bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[REMV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @remv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = frem <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll new file mode 100644 index 0000000000000..3774791d58f87 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll @@ -0,0 +1,376 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Relational instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[UEQ:%.*]] "test_ueq" +; CHECK-DAG: OpName [[OEQ:%.*]] "test_oeq" +; CHECK-DAG: OpName [[UNE:%.*]] "test_une" +; CHECK-DAG: OpName [[ONE:%.*]] "test_one" +; CHECK-DAG: OpName [[ULT:%.*]] "test_ult" +; CHECK-DAG: OpName [[OLT:%.*]] "test_olt" +; CHECK-DAG: OpName [[ULE:%.*]] "test_ule" +; CHECK-DAG: OpName [[OLE:%.*]] "test_ole" +; CHECK-DAG: OpName [[UGT:%.*]] "test_ugt" +; CHECK-DAG: OpName [[OGT:%.*]] "test_ogt" +; CHECK-DAG: OpName [[UGE:%.*]] "test_uge" +; CHECK-DAG: OpName [[OGE:%.*]] "test_oge" +; CHECK-DAG: OpName [[UNO:%.*]] "test_uno" +; CHECK-DAG: OpName [[ORD:%.*]] "test_ord" +; CHECK-DAG: OpName [[v3UEQ:%.*]] "test_v3_ueq" +; CHECK-DAG: OpName [[v3OEQ:%.*]] "test_v3_oeq" +; CHECK-DAG: OpName [[v3UNE:%.*]] "test_v3_une" +; CHECK-DAG: OpName [[v3ONE:%.*]] "test_v3_one" +; CHECK-DAG: OpName [[v3ULT:%.*]] "test_v3_ult" +; CHECK-DAG: OpName [[v3OLT:%.*]] "test_v3_olt" +; CHECK-DAG: OpName [[v3ULE:%.*]] "test_v3_ule" +; CHECK-DAG: OpName [[v3OLE:%.*]] "test_v3_ole" +; CHECK-DAG: OpName [[v3UGT:%.*]] "test_v3_ugt" +; CHECK-DAG: OpName [[v3OGT:%.*]] "test_v3_ogt" +; CHECK-DAG: OpName [[v3UGE:%.*]] "test_v3_uge" +; CHECK-DAG: OpName [[v3OGE:%.*]] "test_v3_oge" +; CHECK-DAG: OpName [[v3UNO:%.*]] "test_v3_uno" +; CHECK-DAG: OpName [[v3ORD:%.*]] "test_v3_ord" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 3 + +; CHECK: [[UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ueq(bfloat %a, bfloat %b) { + %r = fcmp ueq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter 
[[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oeq(bfloat %a, bfloat %b) { + %r = fcmp oeq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_une(bfloat %a, bfloat %b) { + %r = fcmp une bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_one(bfloat %a, bfloat %b) { + %r = fcmp one bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ult(bfloat %a, bfloat %b) { + %r = fcmp ult bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_olt(bfloat %a, bfloat %b) { + %r = fcmp olt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ule(bfloat %a, bfloat %b) { + %r = fcmp ule bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ole(bfloat %a, bfloat %b) { + %r = fcmp ole bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ugt(bfloat %a, bfloat %b) { + %r = fcmp ugt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ogt(bfloat %a, bfloat %b) { + %r = fcmp ogt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; 
CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uge(bfloat %a, bfloat %b) { + %r = fcmp uge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oge(bfloat %a, bfloat %b) { + %r = fcmp oge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ord(bfloat %a, bfloat %b) { + %r = fcmp ord bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uno(bfloat %a, bfloat %b) { + %r = fcmp uno bfloat %a, %b + ret i1 %r +} + +; CHECK: [[v3UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ueq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ueq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oeq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oeq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_une(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp une <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_one(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp one <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ult(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ult <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLT]] = OpFunction +; 
CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_olt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp olt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ule(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ule <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ole(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ole <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ugt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ugt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ogt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ogt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ord(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ord <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; 
CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uno(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uno <3 x bfloat> %a, %b + ret <3 x i1> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll new file mode 100644 index 0000000000000..717771c965496 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - -filetype=obj | spirv-val %} +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability KernelAttributesINTEL +; CHECK: OpExtension "SPV_INTEL_kernel_attributes" +; CHECK: OpEntryPoint {{.*}} %[[DIM1:[0-9]+]] "Dim1" +; CHECK: OpEntryPoint {{.*}} %[[DIM2:[0-9]+]] "Dim2" +; CHECK: OpEntryPoint {{.*}} %[[DIM3:[0-9]+]] "Dim3" +; CHECK: OpExecutionMode %[[DIM1]] MaxWorkgroupSizeINTEL 4 1 1 +; CHECK: OpExecutionMode %[[DIM2]] MaxWorkgroupSizeINTEL 8 4 1 +; CHECK: OpExecutionMode %[[DIM3]] MaxWorkgroupSizeINTEL 16 8 4 +; CHECK: %[[DIM1]] = OpFunction +; CHECK: %[[DIM2]] = OpFunction +; CHECK: %[[DIM3]] = OpFunction + +define spir_kernel void @Dim1() !max_work_group_size !0 { + ret void +} + +define spir_kernel void @Dim2() !max_work_group_size !1 { + ret void +} + +define spir_kernel void @Dim3() !max_work_group_size !2 { + ret void +} + +!0 = !{i32 4} +!1 = !{i32 8, i32 4} +!2 = !{i32 16, i32 8, i32 4} diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll index f745794e11de1..15905dd1894e2 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s define i6 @getConstantI6() { ret i6 2 diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll index afffd9e69b454..11e7d006c5ecf 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll @@ -1,4 +1,6 @@ ; REQUIRES: spirv-tools ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck --check-prefix=AMDGCNSPIRV %s ; CHECK: Generator: {{.*}}{{43|LLVM SPIR-V Backend}}{{.*}} +; AMDGCNSPIRV: Generator: {{.*}}{{65535|LLVM SPIR-V Backend}}{{.*}} diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll index 686c1e97257ad..49ee9931d1126 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll @@ -6,6 +6,7 @@ ; RUN: llc -O0 
-mtriple=spirv64v1.4-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV14 ; RUN: llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV15 ; RUN: llc -O0 -mtriple=spirv64v1.6-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV16 +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=AMDGCNSPIRV ; CHECK-SPIRV10: Version: 1.0 ; CHECK-SPIRV11: Version: 1.1 @@ -14,3 +15,4 @@ ; CHECK-SPIRV14: Version: 1.4 ; CHECK-SPIRV15: Version: 1.5 ; CHECK-SPIRV16: Version: 1.6 +; AMDGCNSPIRV: Version: 1.6 diff --git a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll index 73c46b18bfa78..c9b2968a4aed7 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll @@ -10,6 +10,7 @@ ; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0 ; CHECK-DAG: %[[#Half:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32 ; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#Half]] ; CHECK-DAG: %[[#Void:]] = OpTypeVoid ; CHECK-DAG: %[[#PtrInt8:]] = OpTypePointer CrossWorkgroup %[[#Int8:]] @@ -17,12 +18,20 @@ ; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0 ; CHECK-DAG: %[[#PtrInt64:]] = OpTypePointer CrossWorkgroup %[[#Int64]] ; CHECK-DAG: %[[#BarType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt64]] %[[#Struct]] +; CHECK-DAG: %[[#BazType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt8]] %[[#Struct]] %[[#Int8]] %[[#Struct]] %[[#Float]] %[[#Struct]] ; CHECK: OpFunction %[[#Void]] None %[[#FooType]] ; CHECK: OpFunctionParameter %[[#PtrInt8]] ; CHECK: OpFunctionParameter %[[#Struct]] ; CHECK: OpFunction %[[#Void]] None %[[#BarType]] ; CHECK: OpFunctionParameter %[[#PtrInt64]] ; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunction %[[#Void]] None %[[#BazType]] +; CHECK: OpFunctionParameter %[[#PtrInt8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Int8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Float]] +; CHECK: OpFunctionParameter %[[#Struct]] %t_half = type { half } @@ -38,4 +47,9 @@ entry: ret void } +define spir_kernel void @baz(ptr addrspace(1) %a, %t_half %b, i8 %c, %t_half %d, float %e, %t_half %f) { +entry: + ret void +} + declare spir_func %t_half @_Z29__spirv_SpecConstantComposite(half) diff --git a/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll new file mode 100644 index 0000000000000..6a9ce4515f5c0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[SET:%.*]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: [[UINT:%.*]] = OpTypeInt 32 0 +; CHECK-DAG: [[FLOAT:%.*]] = OpTypeFloat 32 +; CHECK-DAG: [[FLOAT2:%.*]] = OpTypeVector [[FLOAT]] 2 + +; CHECK: [[P0:%.*]] = OpFunctionParameter [[UINT]] +; CHECK: [[UNPACK2:%.*]] = OpExtInst [[FLOAT2]] [[SET]] UnpackHalf2x16 [[P0]] +; CHECK: [[UNPACK:%.*]] = OpCompositeExtract [[FLOAT]] [[UNPACK2]] 0 +; CHECK: OpReturnValue [[UNPACK]] +define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 { + %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> 
@llvm.spv.unpackhalf2x16.v2f32(i32 %0) + %3 = extractelement <2 x float> %2, i64 0 + ret float %3 +} + diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll index 678d9a9073155..ff9b6a34c1d53 100644 --- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll +++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll @@ -22,10 +22,10 @@ define void @main(i16 %in) { ; CHECK-NEXT: locghile %r3, 1 ; CHECK-NEXT: o %r0, 0(%r1) ; CHECK-NEXT: larl %r1, g_222 -; CHECK-NEXT: lghi %r5, 0 ; CHECK-NEXT: dsgfr %r2, %r0 +; CHECK-NEXT: lghi %r3, 0 ; CHECK-NEXT: stgrl %r2, g_39 -; CHECK-NEXT: stc %r5, 19(%r1) +; CHECK-NEXT: stc %r3, 19(%r1) ; CHECK-NEXT: br %r14 %tmp = load i32, ptr @g_151, align 4 %tmp3 = or i32 %tmp, 1 diff --git a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll index 9c638199bb6e6..1cfda8a821bd6 100644 --- a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll +++ b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll @@ -10,7 +10,7 @@ declare i32 @has_ptr_arg(ptr) ; CHECK-LABEL: test_invalid_rtn: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.1, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid, $pop[[L0]]{{$}} @@ -32,7 +32,7 @@ define void @test_struct_rtn() { ; CHECK-LABEL: test_invalid_arg: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 2{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.2, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, has_ptr_arg, $pop[[L0]]{{$}} @@ -54,8 +54,8 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2: -; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32) +; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.1: +; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.1 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function @@ -64,7 +64,7 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4: -; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32) +; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.2: +; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.2 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll index 87059c5d474e6..6ae7b2260c15c 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define 
dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll index 5fb2dcdc1d621..ca7c3573a3294 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir index 41e1b5bf22bf1..5c059a4e0539d 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86 +# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64 --- | @@ -30,24 +31,23 @@ ... --- name: test_copy -# ALL-LABEL: name: test_copy alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -56,24 +56,23 @@ body: | ... 
--- name: test_copy2 -# ALL-LABEL: name: test_copy2 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy2 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -82,30 +81,35 @@ body: | ... --- name: test_copy3 -# ALL-LABEL: name: test_copy3 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr16 = COPY $ax -# X32-NEXT: %3:gr16_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; X86-LABEL: name: test_copy3 + ; X86: liveins: $eax + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy3 + ; X64: liveins: $eax + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s16) = COPY $ax %1(s8) = G_TRUNC %0(s16) %2(s32) = G_ZEXT %1(s8) @@ -115,27 +119,25 @@ body: | ... 
--- name: test_copy4 -# ALL-LABEL: name: test_copy4 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $eax -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy4 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $eax %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ZEXT %1(s16) @@ -145,30 +147,35 @@ body: | ... --- name: test_copy5 -# ALL-LABEL: name: test_copy5 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# X32-NEXT: %3:gr32_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; X86-LABEL: name: test_copy5 + ; X86: liveins: $eax, $edx + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy5 + ; X64: liveins: $eax, $edx + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s8) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s8) @@ -178,29 +185,26 @@ body: | ... 
--- name: test_copy6 -# ALL-LABEL: name: test_copy6 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF -# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; CHECK-LABEL: name: test_copy6 + ; CHECK: liveins: $eax, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit + ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s16) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 8007d9dcf13bc..32d225273a6e1 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB5_2: -; X86-NEXT: andl 4(%eax), %esi -; X86-NEXT: andl (%eax), %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: setne %al -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_ne_i64: @@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB6_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB6_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, 
%ebp -; X86-NEXT: xorl %esi, %edi -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: @@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB7_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB7_2: -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: movl 4(%edx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: notl %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: andl %esi, %ebp -; X86-NEXT: notl %esi -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: sete %al -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i64: @@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB8_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB8_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: @@ -419,52 +353,26 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind 
{ ; X86-LABEL: init_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl $0, %eax -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: notl %ebp -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB9_4: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl (%edi), %ecx -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: andl %ecx, %ebp -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %ebp, (%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: sete %al +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i64: @@ -516,101 +424,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, (%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 24(%esp,%esi), %edi -; X86-NEXT: movl 28(%esp,%esi), %eax -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 16(%esp,%esi), %edx -; X86-NEXT: movl 20(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: andl 8(%ebx), %edi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: andl 12(%ebx), %eax -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; 
X86-NEXT: setb %al ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: andq 8(%rdi), %rdx -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: setne %al -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %edx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rdx, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rdx, %rsi -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: andq (%rdi), %rdx -; AVX2-NEXT: orq %rsi, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: cmovneq %rsi, %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: andq (%rdi), %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; X64-LABEL: test_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $96, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -623,124 +455,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: xorq %rcx, %rsi -; SSE-NEXT: xorq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: complement_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: xorq %rcx, %rsi -; AVX-NEXT: xorq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: complement_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -755,124 +496,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 
$0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %edx -; X86-NEXT: movl 60(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %esi -; X86-NEXT: movl 52(%esp,%eax), %edi -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl 8(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl %ebx, %ecx -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: movl %edx, 8(%edi) -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %esi, (%edi) -; X86-NEXT: movl %ecx, 4(%edi) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: notq %rsi -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: notq %rdx -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: reset_eq_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq 
%rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: andnq %rcx, %rsi, %r8 -; AVX-NEXT: andq %rsi, %rcx -; AVX-NEXT: andnq %rax, %rdx, %rsi -; AVX-NEXT: andq %rdx, %rax -; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: sete %al -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %r8, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: reset_eq_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -888,350 +538,96 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: set_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %test = and i128 %ld, %bit + %res = or i128 %ld, %bit + %cmp = icmp ne i128 %test, 0 + store i128 %res, ptr %word + ret i1 %cmp +} + +define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i128: +; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; 
X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i128: +; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: orq %rcx, %rsi -; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: setae %al +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: set_ne_i128: +; AVX-LABEL: init_eq_i128: ; AVX: # %bb.0: ; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: orq %rcx, %rsi -; AVX-NEXT: orq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) ; AVX-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %ld = load i128, ptr %word - %test = and 
i128 %ld, %bit - %res = or i128 %ld, %bit - %cmp = icmp ne i128 %test, 0 - store i128 %res, ptr %word - ret i1 %cmp -} - -define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i128: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $128, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrb $3, %dl -; X86-NEXT: andb $12, %dl -; X86-NEXT: negb %dl -; X86-NEXT: movsbl %dl, %esi -; X86-NEXT: movl 64(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esp,%esi), %ebx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 76(%esp,%esi), %edi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ecx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 100(%esp,%ecx), %edi -; X86-NEXT: movl 104(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl %esi -; X86-NEXT: 
andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 108(%esp,%ebx), %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 96(%esp,%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %eax, (%ecx) -; X86-NEXT: movl %edx, 4(%ecx) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: init_eq_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: retq -; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %esi -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: shlxq %rcx, %rsi, %rsi -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rcx -; AVX2-NEXT: cmovneq %rcx, %r9 -; AVX2-NEXT: cmovneq %r8, %rcx -; AVX2-NEXT: movq (%rdi), %rdx -; AVX2-NEXT: movq 8(%rdi), %r8 -; AVX2-NEXT: andnq %r8, %rax, %r10 -; AVX2-NEXT: andq %rax, %r8 -; AVX2-NEXT: andnq %rdx, %rsi, %r11 -; AVX2-NEXT: andq %rsi, %rdx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: orq %rcx, %r11 -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: sete %al -; AVX2-NEXT: movq %r11, (%rdi) -; AVX2-NEXT: movq %r10, 8(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %esi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: shlxq %rcx, %rsi, %rsi -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, 
%r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rsi, %r8 -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rcx -; AVX512-NEXT: cmovneq %rcx, %r9 -; AVX512-NEXT: cmovneq %rax, %rcx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rdx -; AVX512-NEXT: andnq %rdx, %r8, %r10 -; AVX512-NEXT: andq %r8, %rdx -; AVX512-NEXT: andnq %rax, %rsi, %r8 -; AVX512-NEXT: andq %rsi, %rax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: orq %rcx, %r8 -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: sete %al -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %r10, 8(%rdi) -; AVX512-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -1252,344 +648,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $224, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %edi -; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: andl 8(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: andl 44(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 60(%edi), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 28(%edi), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: negl %edx -; X86-NEXT: movl 192(%esp,%edx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 32(%ebx), %ecx -; X86-NEXT: andl (%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: andl 16(%ebx), %edi -; X86-NEXT: andl 48(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; 
X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: andl 52(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq -48(%rsp,%rbx), %rdx -; SSE-NEXT: movq -40(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq -16(%rsp,%rbx), %r11 -; SSE-NEXT: movq -8(%rsp,%rbx), %r10 -; SSE-NEXT: shldq %cl, %r11, %r10 -; SSE-NEXT: movq -32(%rsp,%rbx), %r9 -; SSE-NEXT: movq -24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r8 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -56(%rsp,%rbx), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: shldq %cl, %r15, %r11 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -64(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %rsi -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 48(%rdi), %r11 -; SSE-NEXT: andq 16(%rdi), %rdx -; SSE-NEXT: orq %r11, %rdx -; SSE-NEXT: andq 40(%rdi), %r8 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: andq (%rdi), %rbx -; SSE-NEXT: orq %r9, %rbx -; SSE-NEXT: orq %rdx, %rbx -; SSE-NEXT: andq 8(%rdi), %rsi -; SSE-NEXT: orq %r8, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: setne %al -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, 
%rsi -; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx -; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq -16(%rsp,%rsi), %r11 -; AVX2-NEXT: movq -8(%rsp,%rsi), %r10 -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq -32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq -24(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: movq -64(%rsp,%rsi), %r15 -; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: shldq %cl, %rbx, %r9 -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: andq 32(%rdi), %r9 -; AVX2-NEXT: andq 48(%rdi), %r11 -; AVX2-NEXT: andq 16(%rdi), %rdx -; AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: andq 56(%rdi), %r10 -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r11, %rdx -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq (%rdi), %rcx -; AVX2-NEXT: orq %r9, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: orq %rax, %rsi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: setne %al -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -8(%rsp,%rbx), %r10 -; AVX512-NEXT: shldq %cl, %r11, %r10 -; AVX512-NEXT: movq -32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r8 -; AVX512-NEXT: shldq %cl, %r9, %r8 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rdx -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: shldq %cl, %r14, %r9 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rsi -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: andq 32(%rdi), %r9 -; AVX512-NEXT: andq 48(%rdi), %r11 -; AVX512-NEXT: andq 16(%rdi), %rdx -; AVX512-NEXT: andq 40(%rdi), %r8 -; AVX512-NEXT: andq 56(%rdi), %r10 -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r11, %rdx -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: andq (%rdi), %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: andq 8(%rdi), %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: setne %al -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: test_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $60, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, 
%ofs @@ -1602,572 +679,33 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl 
%cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed 
$ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: 
xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, 
%r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: xorq %rcx, %r10 -; SSE-NEXT: xorq %r14, %r9 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: xorq %rdx, %r11 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: complement_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: 
movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: xorq %rax, %r10 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: xorq %r15, %r11 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; 
AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: xorq %rax, %r10 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: xorq %r15, %r11 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: complement_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -2182,606 +720,33 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $288, %esp # imm = 0x120 
-; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 4(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edi), %eax -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl 12(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edi), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edi), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl 44(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 52(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 256(%esp,%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl 32(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: orl 
%edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl 52(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 60(%eax) -; X86-NEXT: movl %esi, 56(%eax) -; X86-NEXT: movl %ecx, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: movl %ebx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i512: 
-; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rdx -; SSE-NEXT: movq (%rsp,%rdx), %r9 -; SSE-NEXT: movq 8(%rsp,%rdx), %r8 -; SSE-NEXT: movq %r8, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq -8(%rsp,%rdx), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 16(%rsp,%rdx), %r14 -; SSE-NEXT: movq 24(%rsp,%rdx), %r10 -; SSE-NEXT: movq %r10, %rbx -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: shldq %cl, %r8, %r14 -; SSE-NEXT: movq 32(%rsp,%rdx), %r13 -; SSE-NEXT: movq 40(%rsp,%rdx), %r12 -; SSE-NEXT: shldq %cl, %r13, %r12 -; SSE-NEXT: shldq %cl, %r10, %r13 -; SSE-NEXT: movq -16(%rsp,%rdx), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r12, %rbp -; SSE-NEXT: movq %r9, %r15 -; SSE-NEXT: movq %rsi, %r11 -; SSE-NEXT: movq 16(%rdi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r13 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: orq %r13, %r9 -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r12 -; SSE-NEXT: movq 24(%rdi), %r10 -; SSE-NEXT: andq %r10, %rsi -; SSE-NEXT: orq %r12, %rsi -; SSE-NEXT: movq %r14, %r13 -; SSE-NEXT: movq 32(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: movq %rdx, %r12 -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %rdx -; SSE-NEXT: orq %r14, %rdx -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: movq %rbx, %r14 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: andq %rcx, %rbx -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq 8(%rdi), %r8 -; SSE-NEXT: andq %r8, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq %r10, %r11 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: notq %rcx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: notq %r12 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rcx, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r13, 32(%rdi) -; SSE-NEXT: movq %r14, 40(%rdi) -; 
SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r11, 24(%rdi) -; SSE-NEXT: movq %r12, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: reset_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rdx -; AVX2-NEXT: movq -48(%rsp,%rdx), %r8 -; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%rdx), %r10 -; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi -; AVX2-NEXT: shldq %cl, %r10, %rsi -; AVX2-NEXT: movq -32(%rsp,%rdx), %r11 -; AVX2-NEXT: movq -24(%rsp,%rdx), %r14 -; AVX2-NEXT: movq %r14, %r9 -; AVX2-NEXT: shldq %cl, %r11, %r9 -; AVX2-NEXT: movq -64(%rsp,%rdx), %r15 -; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: shldq %cl, %rbx, %r11 -; AVX2-NEXT: shldq %cl, %r15, %rdx -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: movq 24(%rdi), %rbx -; AVX2-NEXT: movq 56(%rdi), %r14 -; AVX2-NEXT: movq 16(%rdi), %r15 -; AVX2-NEXT: movq 48(%rdi), %r13 -; AVX2-NEXT: movq 32(%rdi), %rbp -; AVX2-NEXT: andnq %rbp, %r11, %r12 -; AVX2-NEXT: andq %r11, %rbp -; AVX2-NEXT: andnq %r13, %r10, %r11 -; AVX2-NEXT: andq %r10, %r13 -; AVX2-NEXT: andnq %r15, %r8, %r10 -; AVX2-NEXT: andq %r8, %r15 -; AVX2-NEXT: movq 40(%rdi), %r8 -; AVX2-NEXT: orq %r13, %r15 -; AVX2-NEXT: andnq %r8, %r9, %r13 -; AVX2-NEXT: andq %r9, %r8 -; AVX2-NEXT: andnq %r14, %rsi, %r9 -; AVX2-NEXT: andq %rsi, %r14 -; AVX2-NEXT: andnq %rbx, %rax, %rsi -; AVX2-NEXT: andq %rax, %rbx -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: orq %r14, %rbx -; AVX2-NEXT: andnq %rax, %rcx, %r14 -; AVX2-NEXT: andq %rcx, %rax -; AVX2-NEXT: orq %rbp, %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: andnq %rcx, %rdx, %r15 -; AVX2-NEXT: andq %rdx, %rcx -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rbx, %rcx -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: movq %r11, 48(%rdi) -; AVX2-NEXT: movq %r9, 56(%rdi) -; AVX2-NEXT: movq %r12, 32(%rdi) -; AVX2-NEXT: movq %r13, 40(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 24(%rdi) -; AVX2-NEXT: movq %r14, (%rdi) -; AVX2-NEXT: movq %r15, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $8, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r8, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r10 -; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq -32(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r9 -; AVX512-NEXT: shldq %cl, %r11, %r9 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %r8 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: shldq %cl, %r14, %r11 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rdx -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: movq 24(%rdi), %rbx -; AVX512-NEXT: movq 56(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %r15 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r11, %r12 -; AVX512-NEXT: andq %r11, %rbp -; AVX512-NEXT: andnq %r13, %r10, %r11 -; AVX512-NEXT: andq %r10, %r13 -; AVX512-NEXT: andnq %r15, %r8, %r10 -; AVX512-NEXT: andq %r8, %r15 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r15 -; AVX512-NEXT: andnq %r8, %r9, %r13 -; AVX512-NEXT: andq %r9, %r8 -; AVX512-NEXT: andnq %r14, %rsi, %r9 -; AVX512-NEXT: andq %rsi, %r14 -; AVX512-NEXT: andnq %rbx, %rax, %rsi -; AVX512-NEXT: andq %rax, %rbx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: orq %r14, %rbx -; AVX512-NEXT: andnq %rax, %rcx, %r14 -; AVX512-NEXT: andq %rcx, %rax -; AVX512-NEXT: orq %rbp, %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: orq %r15, %rax -; AVX512-NEXT: andnq %rcx, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rcx -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: movq %r11, 48(%rdi) -; AVX512-NEXT: movq %r9, 56(%rdi) -; AVX512-NEXT: movq %r12, 32(%rdi) -; AVX512-NEXT: movq %r13, 40(%rdi) -; AVX512-NEXT: movq %r10, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %r14, (%rdi) -; AVX512-NEXT: movq %r15, 8(%rdi) -; AVX512-NEXT: sete %al -; AVX512-NEXT: addq $8, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: reset_eq_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -2797,4226 +762,135 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl 
%edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: set_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = or i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i512: +; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i512: +; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, 
%ecx -; SSE-NEXT: andl $63, %ecx ; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: orq %rcx, %r10 -; SSE-NEXT: orq %r14, %r9 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: setae %al +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: set_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: orq %rax, %r10 -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: orq %r15, %r11 -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: set_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq 
%rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: orq %rax, %r10 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: orq %r15, %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; 
AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = or i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $432, %esp # imm = 0x1B0 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %edx, %esi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl 48(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edi -; X86-NEXT: movl 
%edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movzbl %bl, %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %esi -; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 52(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 48(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; 
X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 40(%edi), %ebx -; X86-NEXT: movl 44(%edi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 32(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 28(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 24(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 20(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 16(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 12(%edi), %eax -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 8(%edi), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%edi), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx 
-; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl (%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 60(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 56(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 24(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %esi, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: init_eq_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $216, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: 
shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %r10 -; SSE-NEXT: movq 184(%rsp,%r10), %r11 -; SSE-NEXT: movq 192(%rsp,%r10), %rsi -; SSE-NEXT: movq %rsi, %r13 -; SSE-NEXT: shldq %cl, %r11, %r13 -; SSE-NEXT: movq 200(%rsp,%r10), %r15 -; SSE-NEXT: shldq %cl, %rsi, %r15 -; SSE-NEXT: movq 168(%rsp,%r10), %rbx -; SSE-NEXT: movq 176(%rsp,%r10), %rsi -; SSE-NEXT: movq %rsi, %r14 -; SSE-NEXT: shldq %cl, %rbx, %r14 -; SSE-NEXT: shldq %cl, %rsi, %r11 -; SSE-NEXT: movq 152(%rsp,%r10), %rax -; SSE-NEXT: movq 160(%rsp,%r10), %r8 -; SSE-NEXT: movq %r8, %r12 -; SSE-NEXT: shldq %cl, %rax, %r12 -; SSE-NEXT: shldq %cl, %r8, %rbx -; SSE-NEXT: movq 144(%rsp,%r10), %r9 -; SSE-NEXT: movq %r9, %r8 -; SSE-NEXT: shlq %cl, %r8 -; SSE-NEXT: shldq %cl, %r9, %rax -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movl %edx, %edx -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 16(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rsi, %r13 -; SSE-NEXT: andq %rdx, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %r15, %rsi -; SSE-NEXT: movq 56(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r15 -; SSE-NEXT: movq %rbx, %r13 -; SSE-NEXT: movq 24(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %rbx -; SSE-NEXT: orq %r15, %rbx -; SSE-NEXT: movq %r14, %rbp -; SSE-NEXT: movq 32(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r14 -; SSE-NEXT: movq %r8, %r15 -; SSE-NEXT: movq (%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r8 -; SSE-NEXT: orq %r14, %r8 -; SSE-NEXT: orq %r12, %r8 -; SSE-NEXT: movq %r11, %r12 -; SSE-NEXT: movq 40(%rdi), %r9 -; SSE-NEXT: andq %r9, %r11 -; SSE-NEXT: movq %rax, %r14 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %rax -; SSE-NEXT: orq %r11, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq 56(%rsp,%r10), %r11 -; SSE-NEXT: movq 64(%rsp,%r10), %rax -; SSE-NEXT: movq %rax, %rbx -; SSE-NEXT: shldq %cl, %r11, %rbx -; SSE-NEXT: orq %rbx, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: notq %rsi -; SSE-NEXT: movq 72(%rsp,%r10), %rbx -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq 40(%rsp,%r10), %rax -; SSE-NEXT: movq 48(%rsp,%r10), %rdx -; SSE-NEXT: movq %rdx, %rbx -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: orq %rbx, %rbp -; 
SSE-NEXT: notq %r12 -; SSE-NEXT: andq %r9, %r12 -; SSE-NEXT: shldq %cl, %rdx, %r11 -; SSE-NEXT: movq 24(%rsp,%r10), %r9 -; SSE-NEXT: movq 32(%rsp,%r10), %rdx -; SSE-NEXT: movq %rdx, %rbx -; SSE-NEXT: shldq %cl, %r9, %rbx -; SSE-NEXT: orq %r11, %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: orq %rbx, %r11 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: movq 16(%rsp,%r10), %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: orq %rdx, %r15 -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: orq %r9, %r14 -; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %rsi, 56(%rdi) -; SSE-NEXT: movq %rbp, 32(%rdi) -; SSE-NEXT: movq %r12, 40(%rdi) -; SSE-NEXT: movq %r11, 16(%rdi) -; SSE-NEXT: movq %r13, 24(%rdi) -; SSE-NEXT: movq %r15, (%rdi) -; SSE-NEXT: movq %r14, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $216, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $200, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %r8d -; AVX2-NEXT: andl $63, %r8d -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rsi -; AVX2-NEXT: movq 144(%rsp,%rsi), %r11 -; AVX2-NEXT: movq 152(%rsp,%rsi), %r12 -; AVX2-NEXT: movq %r12, %r10 -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq 176(%rsp,%rsi), %r14 -; AVX2-NEXT: movq 184(%rsp,%rsi), %r9 -; AVX2-NEXT: shldq %cl, %r14, %r9 -; AVX2-NEXT: movq 160(%rsp,%rsi), %r15 -; AVX2-NEXT: movq 168(%rsp,%rsi), %r13 -; AVX2-NEXT: movq %r13, %rbx -; AVX2-NEXT: shldq %cl, %r15, %rbx -; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp -; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 136(%rsp,%rsi), %rax -; AVX2-NEXT: shldq %cl, %rax, %r11 -; AVX2-NEXT: shldq %cl, %r13, %r14 -; AVX2-NEXT: shldq %cl, %r12, %r15 -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rdx, (%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq 48(%rdi), %rbp -; AVX2-NEXT: movq 32(%rdi), %r13 -; AVX2-NEXT: andnq %r13, %r15, %rax 
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r15, %r13 -; AVX2-NEXT: andnq %rbp, %r14, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r14, %rbp -; AVX2-NEXT: andnq %r12, %r11, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: andnq %rax, %rbx, %rcx -; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: andq %rbx, %rbp -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: andnq %rcx, %r9, %rbx -; AVX2-NEXT: andq %r9, %rcx -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: andnq %rax, %r10, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r10, %rax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: movq (%rdi), %r10 -; AVX2-NEXT: andnq %r10, %rcx, %r15 -; AVX2-NEXT: andq %rcx, %r10 -; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx -; AVX2-NEXT: movq 48(%rsp,%rsi), %r11 -; AVX2-NEXT: movq %r11, %r9 -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: orq %r13, %r10 -; AVX2-NEXT: orq %r12, %r10 -; AVX2-NEXT: movq 8(%rdi), %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq %r13, %rcx, %r12 -; AVX2-NEXT: andq %rcx, %r13 -; AVX2-NEXT: orq %rbp, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq 56(%rsp,%rsi), %rax -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %r11, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: orq %r9, %r14 -; AVX2-NEXT: orq %rax, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 24(%rsp,%rsi), %rax -; AVX2-NEXT: movq 32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, %r11 -; AVX2-NEXT: shldq %cl, %rax, %r11 -; AVX2-NEXT: shldq %cl, %r9, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT: orq %r11, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: orq %rdx, %rbx -; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx -; AVX2-NEXT: movq 16(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, %r11 -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: shldq %cl, %r9, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: orq %r11, %r9 -; AVX2-NEXT: movq (%rsp,%rsi), %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: shlxq %r8, %rsi, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: orq %rax, %r15 -; AVX2-NEXT: orq %rdx, %r12 -; AVX2-NEXT: orq %r10, %r13 -; AVX2-NEXT: movq %r14, 48(%rdi) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: movq %rax, 56(%rdi) -; AVX2-NEXT: movq %rbp, 32(%rdi) -; AVX2-NEXT: movq %rbx, 40(%rdi) -; AVX2-NEXT: movq %r9, 16(%rdi) -; AVX2-NEXT: movq %r11, 24(%rdi) -; AVX2-NEXT: movq %r15, (%rdi) -; AVX2-NEXT: movq %r12, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $200, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $184, %rsp 
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rsi -; AVX512-NEXT: movq 128(%rsp,%rsi), %r10 -; AVX512-NEXT: movq 136(%rsp,%rsi), %r12 -; AVX512-NEXT: movq %r12, %rax -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 160(%rsp,%rsi), %r14 -; AVX512-NEXT: movq 168(%rsp,%rsi), %rax -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 144(%rsp,%rsi), %r15 -; AVX512-NEXT: movq 152(%rsp,%rsi), %r11 -; AVX512-NEXT: movq %r11, %rbx -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: movq 120(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %r10 -; AVX512-NEXT: shldq %cl, %r11, %r14 -; AVX512-NEXT: movq %rdi, %r9 -; AVX512-NEXT: movq 112(%rsp,%rsi), %r11 -; AVX512-NEXT: shldq %cl, %r12, %r15 -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r15, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r15, %rbp -; AVX512-NEXT: andnq %r13, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r14, %r13 -; AVX512-NEXT: andnq %r12, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r10, %r12 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r12 -; AVX512-NEXT: andnq %r8, %rbx, %rdi -; AVX512-NEXT: andq %rbx, %r8 -; AVX512-NEXT: movq 56(%r9), %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: andnq %r13, %rdx, %r10 -; AVX512-NEXT: andq %rdx, %r13 -; AVX512-NEXT: movq 24(%r9), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: andnq %rax, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rax -; AVX512-NEXT: orq %r13, %rax -; AVX512-NEXT: shlxq %rcx, %r11, %r13 -; AVX512-NEXT: movq (%r9), %rdx -; AVX512-NEXT: andnq %rdx, %r13, %r14 -; AVX512-NEXT: andq %r13, %rdx -; AVX512-NEXT: orq %rbp, %rdx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r11, %rbp -; AVX512-NEXT: orq %r12, %rdx -; AVX512-NEXT: movq 8(%r9), %r13 -; AVX512-NEXT: andnq %r13, %rbp, %rbx -; AVX512-NEXT: andq %rbp, %r13 -; AVX512-NEXT: orq %r8, %r13 -; AVX512-NEXT: movq 24(%rsp,%rsi), %r8 -; AVX512-NEXT: orq %rax, %r13 -; AVX512-NEXT: movq 32(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %r8, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: orq %r12, %r11 -; AVX512-NEXT: movq 40(%rsp,%rsi), %r12 -; AVX512-NEXT: shldq %cl, %rax, %r12 -; AVX512-NEXT: orq %r12, 
%r10
-; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 8(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 16(%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %rax, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %r10
-; AVX512-NEXT: shldq %cl, %r12, %r8
-; AVX512-NEXT: orq %r8, %rdi
-; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq -8(%rsp,%rsi), %r8
-; AVX512-NEXT: movq (%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %r8, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %rdi
-; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: shlxq %rcx, %rsi, %rax
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %rdx, %r13
-; AVX512-NEXT: movq %r11, 48(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 56(%r9)
-; AVX512-NEXT: movq %r10, 32(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 40(%r9)
-; AVX512-NEXT: movq %rdi, 16(%r9)
-; AVX512-NEXT: movq %r15, 24(%r9)
-; AVX512-NEXT: movq %r14, (%r9)
-; AVX512-NEXT: movq %rbx, 8(%r9)
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: addq $184, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %mask = xor i512 %bit, -1
- %val0 = zext i1 %value to i512
- %val = shl nuw i512 %val0, %ofs
+ %mask = xor i512 %bit, -1
+ %val0 = zext i1 %value to i512
+ %val = shl nuw i512 %val0, %ofs
 %ld = load i512, ptr %word
 %test = and i512 %ld, %bit
 %res0 = and i512 %ld, %mask
 %res = or i512 %res0, %val
- %cmp = icmp eq i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
-}
-
-; i4096
-
-define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
-; X86-LABEL: test_ne_i4096:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1792, %esp # imm = 0x700
-; X86-NEXT: movl 12(%ebp), %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $508, %ecx # imm = 0x1FC
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 248(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 252(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 504(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 508(%esi), %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 120(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 124(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 376(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 380(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 184(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 188(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 440(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 444(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 312(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 316(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 216(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 220(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 472(%esi), %edi -; X86-NEXT: movl 476(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 92(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 344(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 348(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 152(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 156(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 408(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 412(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 280(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 284(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 232(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 236(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 488(%esi), %edx -; 
X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 492(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 108(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 360(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 364(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 168(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 172(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 424(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 428(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 296(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 300(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 200(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 204(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 456(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 460(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 328(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 332(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 140(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 392(%esi), %edx -; X86-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 396(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 264(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 268(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 240(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 244(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 496(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 500(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 116(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 368(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 372(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 176(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 180(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 432(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 436(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 304(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 308(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 208(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 212(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 464(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 468(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 84(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 336(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 340(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 148(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 400(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 404(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 272(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 276(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 224(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 228(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 480(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 484(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 352(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 356(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 160(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 164(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 416(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 420(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 288(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 292(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 192(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 196(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 448(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 452(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 320(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 324(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 132(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl 256(%esi), %edi -; X86-NEXT: movl 260(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded 
Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 388(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $1, %eax, %edi -; X86-NEXT: shrl %eax -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: notb %cl -; X86-NEXT: shrdl %cl, %eax, %edi -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: movb $32, %cl -; X86-NEXT: testb %cl, %cl -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: jne .LBB20_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: .LBB20_2: -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 320(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 64(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 448(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 192(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 288(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 32(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 416(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 160(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 352(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 96(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 480(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 224(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 272(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 16(%eax), %edx -; X86-NEXT: orl 
%ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 400(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 144(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 336(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 80(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 464(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 208(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 304(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 48(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 432(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 176(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 368(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 112(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 496(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: andl 240(%eax), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 264(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 8(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 392(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 136(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 328(%ebx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 72(%ebx), %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 456(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 200(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 296(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 424(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 168(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 360(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 104(%ebx), %eax -; X86-NEXT: 
orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 488(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 232(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 280(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 408(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 152(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 344(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 88(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 472(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 216(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 312(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 440(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 184(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 376(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 120(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 504(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 248(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 324(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 68(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 452(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 196(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 292(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 420(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 164(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 356(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 100(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 484(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 228(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 276(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 404(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 148(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 340(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 84(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 468(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 212(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 308(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 52(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 436(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 180(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 372(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 116(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 500(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 244(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 268(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 396(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 140(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 332(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 76(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 460(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 204(%ebx), %edi -; X86-NEXT: orl 
%eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 300(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 44(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 428(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 172(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 364(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 108(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 492(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 236(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 284(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 28(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 412(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 156(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 348(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 92(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 476(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 220(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 316(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 60(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 444(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 188(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 380(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 124(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 508(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: andl 252(%esi), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: negl %ecx -; X86-NEXT: movl 1648(%esp,%ecx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 128(%edx), %ecx
-; X86-NEXT: andl 384(%edx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 256(%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 260(%edx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 4(%edx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 132(%edx), %eax
-; X86-NEXT: andl 388(%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+ %cmp = icmp eq i512 %test, 0
+ store i512 %res, ptr %word
+ ret i1 %cmp
+}
+
+; i4096
+
+define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
+; X86-LABEL: test_ne_i4096:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $4064, %edx # imm = 0xFE0
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
-; SSE-LABEL: test_ne_i4096:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $1576, %rsp # imm = 0x628
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups
%xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %rsi -; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1304(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1560(%rsp,%rsi), %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1176(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1432(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1240(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1496(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1112(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; SSE-NEXT: movq 1368(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1272(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1528(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1144(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1400(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1208(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1464(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1080(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1336(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1288(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1544(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1160(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1416(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1216(%rsp,%rsi), %r11 -; SSE-NEXT: movq 1224(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %r11, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1480(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1088(%rsp,%rsi), %r9 -; SSE-NEXT: movq 1096(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %r9, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1352(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1248(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1512(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1120(%rsp,%rsi), %rax -; SSE-NEXT: movq 1128(%rsp,%rsi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rax, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1376(%rsp,%rsi), %r13 -; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx -; SSE-NEXT: movq %rbx, %r8 -; SSE-NEXT: shldq %cl, %r13, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1192(%rsp,%rsi), %r15 -; SSE-NEXT: movq %r15, %r14 -; SSE-NEXT: shldq %cl, %rdx, %r14 -; SSE-NEXT: movq %r14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1440(%rsp,%rsi), %r10 -; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, %r14 -; SSE-NEXT: shldq %cl, %r10, %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1312(%rsp,%rsi), %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp -; SSE-NEXT: movq %rbp, %r12 -; SSE-NEXT: shldq %cl, %r14, %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx -; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT: shldq %cl, %rbp, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r9 -; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; 
SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r12, %r15 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r12, %r10 -; SSE-NEXT: andq 384(%rdi), %r10 -; SSE-NEXT: andq 128(%rdi), %r15 -; SSE-NEXT: andq 320(%rdi), %r13 -; SSE-NEXT: andq 64(%rdi), %rax -; SSE-NEXT: orq %r10, %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: andq 448(%rdi), %r9 -; SSE-NEXT: andq 192(%rdi), %rbp -; SSE-NEXT: orq %r9, %rbp -; SSE-NEXT: orq %rax, %rbp -; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq 288(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 416(%rdi), %rdx -; SSE-NEXT: andq 160(%rdi), %r11 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 352(%rdi), %rdx -; SSE-NEXT: orq %r9, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 96(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 480(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 224(%rdi), %r8 -; SSE-NEXT: orq %rax, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq 272(%rdi), %r14 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 16(%rdi), %rax -; SSE-NEXT: orq %r14, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 400(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 144(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 336(%rdi), %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 80(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 464(%rdi), %rdx -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 208(%rdi), %r11 -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: orq %rax, %r11 -; SSE-NEXT: orq %r8, %r11 -; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload -; SSE-NEXT: andq 304(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 48(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 432(%rdi), %r9 -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 
# 8-byte Reload -; SSE-NEXT: andq 176(%rdi), %r8 -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 368(%rdi), %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 112(%rdi), %rax -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: movq %r8, %r10 -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 496(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT: andq 240(%rdi), %rbp -; SSE-NEXT: orq %r8, %rbp -; SSE-NEXT: orq %rax, %rbp -; SSE-NEXT: orq %r10, %rbp -; SSE-NEXT: orq %r11, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 392(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: andq 136(%rdi), %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 328(%rdi), %rdx -; SSE-NEXT: orq %rax, %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 72(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 456(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE-NEXT: andq 200(%rdi), %r13 -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: orq %rdx, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 296(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 40(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 424(%rdi), %r8 -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 168(%rdi), %rdx -; SSE-NEXT: orq %r8, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 360(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 104(%rdi), %rax -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 488(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: andq 232(%rdi), %r15 -; SSE-NEXT: orq %rax, %r15 -; SSE-NEXT: orq %r8, %r15 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 280(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %rdx, %r15 -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 408(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 152(%rdi), %rax -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 344(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 88(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 472(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT: andq 216(%rdi), %r14 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: orq %rax, %r14 -; SSE-NEXT: orq %r8, %r14 -; SSE-NEXT: orq %r10, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 
312(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 440(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 184(%rdi), %r9 -; SSE-NEXT: orq %r11, %r10 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: orq %r10, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: andq 376(%rdi), %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 120(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 504(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 248(%rdi), %r8 -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: movq 1056(%rsp,%rsi), %rax -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: andq 256(%rdi), %rdx -; SSE-NEXT: orq %r14, %r8 -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq %rbp, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: andq 264(%rdi), %rcx -; SSE-NEXT: andq 8(%rdi), %rbx -; SSE-NEXT: orq %rcx, %rbx -; SSE-NEXT: orq %r12, %rbx -; SSE-NEXT: orq %r13, %rbx -; SSE-NEXT: orq %r15, %rbx -; SSE-NEXT: orq %r8, %rbx -; SSE-NEXT: orq %rax, %rbx -; SSE-NEXT: setne %al -; SSE-NEXT: addq $1576, %rsp # imm = 0x628 -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i4096: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups 
%ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %rsi -; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11 -; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %r11, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12 -; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %r12, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp -; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax -; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10 -; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8 -; AVX2-NEXT: movq %r8, %rdx -; AVX2-NEXT: shldq %cl, %r10, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx -; AVX2-NEXT: movq %rbx, %rdx -; AVX2-NEXT: shldq %cl, %r9, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9 -; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, %r14 -; AVX2-NEXT: shldq %cl, %r9, %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r13 -; AVX2-NEXT: shldq %cl, %r15, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx -; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, %r9 -; AVX2-NEXT: andq 384(%rdi), %r9 -; AVX2-NEXT: andq 128(%rdi), %r14 -; AVX2-NEXT: andq 320(%rdi), %r10 -; AVX2-NEXT: orq %r9, %r14 -; AVX2-NEXT: movq %r14, %r15 -; AVX2-NEXT: andq 64(%rdi), %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq 448(%rdi), %rbp -; AVX2-NEXT: andq 192(%rdi), %r13 -; AVX2-NEXT: orq %rbp, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq 288(%rdi), %r8 -; AVX2-NEXT: andq 32(%rdi), %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 416(%rdi), %rax -; AVX2-NEXT: orq %r8, %r12 -; AVX2-NEXT: andq 160(%rdi), 
%r11 -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: andq 352(%rdi), %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 96(%rdi), %rax -; AVX2-NEXT: orq %r12, %r11 -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 480(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: andq 224(%rdi), %r13 -; AVX2-NEXT: orq %r10, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 272(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 16(%rdi), %rax -; AVX2-NEXT: orq %r11, %r13 -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 400(%rdi), %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 144(%rdi), %rax -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 336(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 80(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 464(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 208(%rdi), %r11 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: orq %r8, %r11 -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: orq %r9, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 304(%rdi), %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 48(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 432(%rdi), %r10 -; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload -; AVX2-NEXT: andq 176(%rdi), %rax -; AVX2-NEXT: orq %r9, %r8 -; AVX2-NEXT: movq %r8, %r9 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 368(%rdi), %r8 -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 112(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 496(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 240(%rdi), %r9 -; AVX2-NEXT: orq %r8, %r9 -; AVX2-NEXT: orq %rax, %r9 -; AVX2-NEXT: orq %r10, %r9 -; AVX2-NEXT: orq %r11, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 392(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT: andq 136(%rdi), %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 328(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 72(%rdi), %rax -; AVX2-NEXT: orq %r10, %rbp -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 456(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX2-NEXT: andq 200(%rdi), %r12 -; AVX2-NEXT: orq %rax, %r12 -; AVX2-NEXT: orq %r8, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 296(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; 
AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 424(%rdi), %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 168(%rdi), %rax -; AVX2-NEXT: orq %r10, %r8 -; AVX2-NEXT: movq %r8, %r10 -; AVX2-NEXT: orq %r11, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 360(%rdi), %r8 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 104(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 488(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: andq 232(%rdi), %r14 -; AVX2-NEXT: orq %rax, %r14 -; AVX2-NEXT: orq %r8, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 280(%rdi), %r8 -; AVX2-NEXT: orq %r10, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 408(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 152(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 344(%rdi), %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 88(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 472(%rdi), %rax -; AVX2-NEXT: orq %r11, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: andq 216(%rdi), %rbx -; AVX2-NEXT: orq %rax, %rbx -; AVX2-NEXT: orq %r8, %rbx -; AVX2-NEXT: orq %r10, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 312(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 56(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 440(%rdi), %r10 -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 184(%rdi), %r8 -; AVX2-NEXT: orq %r10, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 376(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 120(%rdi), %rax -; AVX2-NEXT: orq %r11, %r8 -; AVX2-NEXT: movq %r8, %r11 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 504(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 248(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r8, %r10 -; AVX2-NEXT: orq %r11, %rax -; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: shlxq %rcx, %rsi, %rax -; AVX2-NEXT: andq 256(%rdi), %r10 -; AVX2-NEXT: andq (%rdi), %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte 
Folded Reload -; AVX2-NEXT: orq %r13, %rax -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andq 264(%rdi), %rcx -; AVX2-NEXT: andq 8(%rdi), %rdx -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: orq %rbp, %rdx -; AVX2-NEXT: orq %r12, %rdx -; AVX2-NEXT: orq %r14, %rdx -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $1560, %rsp # imm = 0x618 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i4096: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0 -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %rsi -; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX512-NEXT: movq 1160(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 
-; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10 -; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14 -; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12 -; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r12, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax -; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11 -; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx -; AVX512-NEXT: movq %rbx, %rdx -; AVX512-NEXT: shldq %cl, %r11, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8 -; AVX512-NEXT: movq %r8, %rdx -; AVX512-NEXT: shldq %cl, %r9, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9 -; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, %r15 -; AVX512-NEXT: shldq %cl, %r9, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp -; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15 -; AVX512-NEXT: movq %r15, %r13 -; AVX512-NEXT: shldq %cl, %rbp, %r13 -; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx -; AVX512-NEXT: shldq 
%cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbp, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: andq 384(%rdi), %r9 -; AVX512-NEXT: andq 128(%rdi), %r15 -; AVX512-NEXT: orq %r9, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq 320(%rdi), %r11 -; AVX512-NEXT: andq 64(%rdi), %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: andq 448(%rdi), %r12 -; AVX512-NEXT: andq 192(%rdi), %r13 -; AVX512-NEXT: orq %r12, %r13 -; AVX512-NEXT: orq %rax, %r13 -; AVX512-NEXT: andq 288(%rdi), %r8 -; AVX512-NEXT: andq 32(%rdi), %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 416(%rdi), %rax -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: andq 160(%rdi), %r10 -; AVX512-NEXT: orq %rax, %r10 -; AVX512-NEXT: andq 352(%rdi), %rbx -; AVX512-NEXT: orq %r14, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 96(%rdi), %rax -; AVX512-NEXT: orq %rbx, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 480(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andq 224(%rdi), %r15 -; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: orq %r8, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 272(%rdi), %r8 -; AVX512-NEXT: orq %r10, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 16(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; 
AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 400(%rdi), %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 144(%rdi), %rax -; AVX512-NEXT: orq %r9, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 336(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 80(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 464(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: andq 208(%rdi), %r11 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: orq %r8, %r11 -; AVX512-NEXT: orq %rax, %r11 -; AVX512-NEXT: orq %r9, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 304(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 48(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 432(%rdi), %r9 -; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload -; AVX512-NEXT: andq 176(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 368(%rdi), %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 112(%rdi), %rax -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: movq %r8, %r10 -; AVX512-NEXT: orq %r9, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 496(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 240(%rdi), %r9 -; AVX512-NEXT: orq %r8, %r9 -; AVX512-NEXT: orq %rax, %r9 -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 392(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: andq 136(%rdi), %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 328(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 72(%rdi), %rax -; AVX512-NEXT: orq %r10, %rbp -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 456(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT: andq 200(%rdi), %r12 -; AVX512-NEXT: orq %rax, %r12 -; AVX512-NEXT: orq %r8, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 296(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 40(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 424(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 168(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 360(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 104(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: 
movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 488(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: andq 232(%rdi), %r14 -; AVX512-NEXT: orq %rax, %r14 -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: orq %r10, %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 280(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 408(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 152(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: andq 344(%rdi), %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 88(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 472(%rdi), %rax -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: andq 216(%rdi), %rbx -; AVX512-NEXT: orq %rax, %rbx -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: orq %r10, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 312(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 56(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 440(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 184(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 376(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 120(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 504(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 248(%rdi), %r8 -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rsi, %r10 -; AVX512-NEXT: orq %rbx, %r8 -; AVX512-NEXT: shlxq %rcx, %rax, %rsi -; AVX512-NEXT: andq 256(%rdi), %r10 -; AVX512-NEXT: andq (%rdi), %rsi -; AVX512-NEXT: orq %r10, %rsi -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: orq %r13, %rsi -; AVX512-NEXT: orq %r15, %rsi -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 264(%rdi), %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: orq %rbp, %rdx -; AVX512-NEXT: orq %r12, %rdx -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: 
setne %al -; AVX512-NEXT: addq $1560, %rsp # imm = 0x618 -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: test_ne_i4096: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $4064, %eax # imm = 0xFE0 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 4095 %ofs = zext nneg i32 %rem to i4096 %bit = shl nuw i4096 1, %ofs @@ -7032,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_cmpz_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 36(%esp,%esi), %eax -; X86-NEXT: movl 40(%esp,%esi), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 32(%esp,%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: xorl 12(%ecx), %esi -; X86-NEXT: xorl 8(%ecx), %edx -; X86-NEXT: xorl 4(%ecx), %eax -; X86-NEXT: xorl (%ecx), %edi -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: complement_cmpz_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: xorq 8(%rdi), %rdx -; SSE-NEXT: xorq (%rdi), %rax -; SSE-NEXT: movq %rax, (%rdi) -; SSE-NEXT: movq %rdx, 8(%rdi) -; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax ; SSE-NEXT: setne %al ; SSE-NEXT: 
retq ; -; AVX2-LABEL: complement_cmpz_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: xorq 8(%rdi), %rdx -; AVX2-NEXT: xorq (%rdi), %rax -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: movq %rdx, 8(%rdi) -; AVX2-NEXT: orq %rdx, %rax -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_cmpz_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %edx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rdx, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rdx, %rsi -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: xorq 8(%rdi), %rsi -; AVX512-NEXT: xorq (%rdi), %rdx -; AVX512-NEXT: movq %rdx, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -7155,14 +960,152 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-LABEL: reset_multiload_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: btrl %edx, %ebx +; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl %ebx, (%ecx,%esi) +; X86-NEXT: jae .LBB22_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB22_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: reset_multiload_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %r9d +; X64-NEXT: movl %r9d, %r8d +; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl %esi, %r9d +; X64-NEXT: jb .LBB22_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: .LBB22_2: +; X64-NEXT: movl %r8d, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %sel = load i32, ptr %p + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + %ret = select i1 %cmp, i32 %sel, i32 0 + ret i32 %ret +} + +; Multiple uses of the store chain AND stored value +define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind { +; X86-LABEL: chain_reset_i256: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: jne .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl %esi, %eax +; X86-NEXT: .LBB23_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: chain_reset_i256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: movl $-2, %eax +; X64-NEXT: roll %cl, %eax +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $28, %ecx +; X64-NEXT: andl %eax, (%rdi,%rcx) +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %r8 +; X64-NEXT: orq 24(%rdi), %r8 +; X64-NEXT: movq 16(%rdi), %rdi +; X64-NEXT: orq %rcx, %rdi +; X64-NEXT: movl (%rsi), %eax +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: movl (%rdx), %ecx +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: orq %r8, %rdi +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq + %rem = and i32 %position, 255 + %ofs = zext nneg i32 %rem to i256 + %bit = shl nuw i256 1, %ofs + %ld0 = load i256, ptr %p0 + %msk = xor i256 %bit, -1 + %res = and i256 %ld0, %msk + store i256 %res, ptr %p0 + %cmp = icmp ne i256 %res, 0 + %ld1 = load i32, ptr %p1 + %trunc = trunc i256 %res to i32 + store i32 %trunc, ptr %p1 + %ld2 = load i32, ptr %p2 + %add = add i32 %ld1, %ld2 + %sel = select i1 %cmp, i32 %ld2, i32 %add + ret i32 %sel +} + +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: +; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: subl $144, %esp +; X86-NEXT: movb 20(%ebp), %ch +; X86-NEXT: movb 12(%ebp), %cl ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -7176,54 +1119,80 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx ; X86-NEXT: shll %cl, %edi 
-; X86-NEXT: movl 8(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: movl (%ebx), %esi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb %ch, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 84(%esp,%eax), %edx +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl 12(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl 12(%eax), %esi +; X86-NEXT: xorl (%eax), %edi +; X86-NEXT: xorl 4(%eax), %ebx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl 16(%ebp), %eax -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %ebx, 8(%esi) -; X86-NEXT: movl %ecx, 12(%esi) -; X86-NEXT: movl %edi, (%esi) -; X86-NEXT: movl %edx, 4(%esi) -; X86-NEXT: je .LBB22_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB22_2: +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: andb $96, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 96(%esp,%eax), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; 
X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -7231,73 +1200,129 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_multiload_i128: +; SSE-LABEL: sequence_i128: ; SSE: # %bb.0: +; SSE-NEXT: movl %ecx, %eax ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: xorl %r11d, %r11d ; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %rax, %rsi -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: jne .LBB22_2 -; SSE-NEXT: # %bb.1: -; SSE-NEXT: movl (%rdx), %eax -; SSE-NEXT: .LBB22_2: -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: cmovneq %r11, %r9 +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shldq %cl, %r8, %r10 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: cmovneq %r8, %r10 +; SSE-NEXT: cmovneq %r11, %r8 +; SSE-NEXT: xorq 8(%rdi), %rsi +; SSE-NEXT: xorq (%rdi), %r9 +; SSE-NEXT: movl %edx, %ecx +; SSE-NEXT: andb $32, %cl +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq %cl, %rsi, %rax +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: shrq %cl, %r11 +; SSE-NEXT: testb $64, %dl +; SSE-NEXT: cmoveq %rax, %r11 +; SSE-NEXT: btl %edx, %r11d +; SSE-NEXT: setae %al +; SSE-NEXT: orq %r10, %rsi +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: reset_multiload_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: movl $1, %esi -; AVX-NEXT: xorl %r8d, %r8d -; AVX-NEXT: shldq %cl, %rsi, %r8 -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: shlxq %rcx, %rsi, %r9 -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %r9, %r8 -; AVX-NEXT: cmovneq %rax, %r9 -; AVX-NEXT: movq (%rdi), %r10 -; AVX-NEXT: movq 8(%rdi), %r11 -; AVX-NEXT: andnq %r11, %r8, %rcx -; AVX-NEXT: andq %r8, %r11 -; AVX-NEXT: andnq %r10, %r9, %rsi -; AVX-NEXT: andq %r9, %r10 -; AVX-NEXT: orq %r11, %r10 -; AVX-NEXT: jne .LBB22_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl (%rdx), %eax -; AVX-NEXT: .LBB22_2: -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %rcx, 8(%rdi) -; AVX-NEXT: # kill: def $eax killed $eax killed $rax -; AVX-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %mask = xor i128 %bit, -1 +; AVX2-LABEL: sequence_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movl %esi, 
%ecx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: movl $1, %r10d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: shlxq %rcx, %r10, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: cmovneq %r9, %r8 +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: shlxq %rax, %r10, %r10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: cmovneq %r10, %r11 +; AVX2-NEXT: cmovneq %r9, %r10 +; AVX2-NEXT: xorq 8(%rdi), %rsi +; AVX2-NEXT: xorq (%rdi), %r8 +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: andb $32, %cl +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: shrdq %cl, %rsi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: testb $64, %dl +; AVX2-NEXT: cmoveq %rax, %rcx +; AVX2-NEXT: btl %edx, %ecx +; AVX2-NEXT: setae %al +; AVX2-NEXT: orq %r11, %rsi +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: sequence_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %r9, %rsi +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: shlxq %rcx, %r9, %r8 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %r8, %rsi +; AVX512-NEXT: cmovneq %r10, %r8 +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shldq %cl, %r9, %r11 +; AVX512-NEXT: shlxq %rax, %r9, %r9 +; AVX512-NEXT: testb $64, %al +; AVX512-NEXT: cmovneq %r9, %r11 +; AVX512-NEXT: cmovneq %r10, %r9 +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %r8 +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $32, %cl +; AVX512-NEXT: movq %r8, %rax +; AVX512-NEXT: shrdq %cl, %rsi, %rax +; AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512-NEXT: testb $64, %dl +; AVX512-NEXT: cmoveq %rax, %rcx +; AVX512-NEXT: btl %edx, %ecx +; AVX512-NEXT: setae %al +; AVX512-NEXT: orq %r11, %rsi +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: retq + %rem0 = and i32 %pos0, 127 + %rem1 = and i32 %pos1, 127 + %rem2 = and i32 %pos2, 127 + %ofs0 = zext nneg i32 %rem0 to i128 + %ofs1 = zext nneg i32 %rem1 to i128 + %ofs2 = zext nneg i32 %rem2 to i128 + %bit0 = shl nuw i128 1, %ofs0 + %bit1 = shl nuw i128 1, %ofs1 + %bit2 = shl nuw i128 1, %ofs2 %ld = load i128, ptr %word - %sel = load i32, ptr %p - %test = and i128 %ld, %bit - %res = and i128 %ld, %mask - %cmp = icmp eq i128 %test, 0 - store i128 %res, ptr %word - %ret = select i1 %cmp, i32 %sel, i32 0 - ret i32 %ret + %res0 = xor i128 %ld, %bit0 + %test1 = and i128 %res0, %bit1 + %cmp1 = icmp eq i128 %test1, 0 + %res2 = or i128 %res0, %bit2 + store i128 %res2, ptr %word + ret i1 %cmp1 } diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index feac3dcad243a..30f1874c51fed 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; 
GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_ashr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index 0fb0420bb2609..aff2228c258b5 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index 103d5702fb93a..4450d07e01cca 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll index efd742956ed09..41238acc4b74d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; 
AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/DebugInfo/debug-bool-const-value.ll b/llvm/test/DebugInfo/debug-bool-const-value.ll new file mode 100644 index 0000000000000..84cf993cf4aae --- /dev/null +++ b/llvm/test/DebugInfo/debug-bool-const-value.ll @@ -0,0 +1,29 @@ +; REQUIRES: object-emission +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s + +; CHECK: {{.*}}DW_TAG_variable +; CHECK-NEXT: {{.*}} DW_AT_const_value (1) +; CHECK-NEXT: {{.*}} DW_AT_name ("arg") + +define void @test() !dbg !5 +{ +entry: + call void @"llvm.dbg.value"(metadata i1 true, metadata !7, metadata !8), !dbg !6 + ret void, !dbg !6 +} + +declare void @"llvm.dbg.value"(metadata %".1", metadata %".2", metadata %".3") + +!llvm.dbg.cu = !{ !2 } +!llvm.module.flags = !{ !9, !10 } + +!1 = !DIFile(directory: "", filename: "test") +!2 = distinct !DICompileUnit(emissionKind: FullDebug, file: !1, isOptimized: false, language: DW_LANG_C_plus_plus, runtimeVersion: 0) +!3 = !DIBasicType(encoding: DW_ATE_boolean, name: "bool", size: 8) +!4 = !DISubroutineType(types: !{null}) +!5 = distinct !DISubprogram(file: !1, isDefinition: true, isLocal: false, isOptimized: false, line: 5, linkageName: "test", name: "test", scope: !1, scopeLine: 5, type: !4, unit: !2) +!6 = !DILocation(column: 1, line: 5, scope: !5) +!7 = !DILocalVariable(arg: 0, file: !1, line: 5, name: "arg", scope: !5, type: !3) +!8 = !DIExpression() +!9 = !{ i32 2, !"Dwarf Version", i32 4 } +!10 = !{ i32 2, !"Debug Info Version", i32 3 } diff --git a/llvm/test/Feature/intrinsics.ll b/llvm/test/Feature/intrinsics.ll index b6abc0fff6db7..a2da8f29116e1 100644 --- a/llvm/test/Feature/intrinsics.ll +++ b/llvm/test/Feature/intrinsics.ll @@ -64,7 +64,7 @@ define void @libm() { ; FIXME: test ALL the intrinsics in this file. 
-; CHECK: declare void @llvm.trap() #1 +; CHECK: declare void @llvm.trap() #2 declare void @llvm.trap() define void @trap() { @@ -72,5 +72,4 @@ define void @trap() { ret void } -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #1 = { cold noreturn nounwind memory(inaccessiblemem: write) } +; CHECK: attributes #2 = { cold noreturn nounwind memory(inaccessiblemem: write) } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll new file mode 100644 index 0000000000000..877fe5fe4b393 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3 + +define <8 x i64> @test_mm512_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x bfloat> [[TMP6]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <32 x bfloat> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B, i32 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x bfloat> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP8]], <32 x i16> [[TMP14]], <32 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP9]], <32 x bfloat> [[TMP7]], <32 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x bfloat> [[TMP15]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast i32 %U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + %3 = bitcast <32 x bfloat> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(<8 x i64> %C, i32 %U, <16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512( +; CHECK-SAME: <8 x i64> [[C:%.*]], i32 [[U:%.*]], <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: 
unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[C]] to <32 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> zeroinitializer, <32 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x bfloat> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x bfloat> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <32 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <32 x i1> [[TMP12]], <32 x bfloat> [[TMP8]], <32 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x bfloat> [[TMP19]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP21]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <8 x i64> %C to <32 x bfloat> + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x bfloat> %0, <32 x bfloat> %1 + %4 = bitcast <32 x bfloat> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3 + +define <4 x i64> @test_mm512_cvtneps2bf16_512(<16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x bfloat> [[TMP4]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(<16 x float> %A, i16 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], i16 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x bfloat> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i16> [[TMP12]], <16 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x bfloat> [[TMP5]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP13]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP15]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_mask_cvtneps2bf16_512( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 [[U:%.*]], <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i16> zeroinitializer, <16 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <16 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i16> 
[[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> [[TMP16]], <16 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x bfloat> [[TMP6]], <16 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x bfloat> [[TMP17]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP19]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>) #3 + +define <16 x float> @test_mm512_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP8]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + ret <16 x float> %0 +} + +define <16 x float> @test_mm512_maskz_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B, i16 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), 
align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP16]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP17]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} +define <16 x float> @test_mm512_mask_dpbf16ps_512(i16 zeroext %U, <16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpbf16ps_512( +; CHECK-SAME: i16 zeroext [[U:%.*]], <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() 
#[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[E]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %E + ret <16 x float> %2 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll new file mode 100644 index 0000000000000..ac65645a9ec2c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-mov.ll +; +; Strictly handled: (none) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) sanitize_memory { +; CHECK-LABEL: define dso_local void @funbf16( +; CHECK-SAME: ptr readonly [[SRC:%.*]], ptr writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: 
[[TMP10:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD]], ptr [[TMP12]], align 1 +; CHECK-NEXT: store <8 x bfloat> [[TMP4]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP5]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP16]], 87960930222080 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr +; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP18]], align 32 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB19:.*]], label %[[BB20:.*]], !prof [[PROF1]] +; CHECK: [[BB19]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB20]]: +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD1]], ptr [[TMP23]], align 32 +; CHECK-NEXT: store <8 x bfloat> [[TMP15]], ptr [[DST]], align 32 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP7]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] +; CHECK: [[BB24]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB25]]: +; CHECK-NEXT: [[TMP26:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = xor i64 [[TMP27]], 87960930222080 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load <16 x i16>, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP8]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB31]]: +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = xor i64 [[TMP32]], 87960930222080 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD2]], ptr [[TMP34]], align 1 +; CHECK-NEXT: store <16 x bfloat> [[TMP26]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]] +; CHECK: [[BB35]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB36]]: +; CHECK-NEXT: [[TMP37:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 +; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr +; CHECK-NEXT: [[_MSLD3:%.*]] = load <16 x i16>, ptr [[TMP40]], align 32 +; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP10]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF1]] +; CHECK: [[BB41]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; 
CHECK-NEXT: unreachable +; CHECK: [[BB42]]: +; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = xor i64 [[TMP43]], 87960930222080 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD3]], ptr [[TMP45]], align 32 +; CHECK-NEXT: store <16 x bfloat> [[TMP37]], ptr [[DST]], align 32 +; CHECK-NEXT: ret void +; +entry: + %0 = load <8 x bfloat>, ptr %src, align 1 + store <8 x bfloat> %0, ptr %dst, align 1 + %1 = load <8 x bfloat>, ptr %src, align 32 + store <8 x bfloat> %1, ptr %dst, align 32 + %2 = load <16 x bfloat>, ptr %src, align 1 + store <16 x bfloat> %2, ptr %dst, align 1 + %3 = load <16 x bfloat>, ptr %src, align 32 + store <16 x bfloat> %3, ptr %dst, align 32 + ret void +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll new file mode 100644 index 0000000000000..904614e961d6c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll @@ -0,0 +1,774 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 -mattr=+avx512vl < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) +; - llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) +; - llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) +; - llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %6, <4 x i1> %4) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1 + +define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: ret <2 x i64> [[TMP7]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i16> [[TMP14]], <8 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP9]], <8 x bfloat> [[TMP7]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP15]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP17]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load 
i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i16> zeroinitializer, <8 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i16> [[TMP18]], <8 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP12]], <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP21]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3 + +define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> 
[[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x bfloat> [[TMP7]] to <16 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i16> [[TMP14]], <16 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP9]], <16 x bfloat> [[TMP7]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x bfloat> [[TMP15]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP17]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 zeroext [[U:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i16> zeroinitializer, <16 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP10]] to <16 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i16> [[TMP18]], <16 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP12]], <16 x bfloat> [[TMP8]], <16 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x bfloat> [[TMP19]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP21]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3 + +define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> 
zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP12]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x bfloat> [[TMP5]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP15]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_mask_cvtneps2bf16_256( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = 
tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i16> zeroinitializer, <8 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i16> [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> [[TMP16]], <8 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP10]], <8 x bfloat> [[TMP6]], <8 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP19]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3 + +define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <8 x i1> [[TMP3]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> zeroinitializer, <4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP9]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4 + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_mask_cvtneps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP6]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> [[TMP7]], <4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP14]] +; 
+entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = bitcast <2 x i64> %C to <8 x bfloat> + %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128_select( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> zeroinitializer, <8 x i16> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i16> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP13]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP4]], <8 x bfloat> [[TMP7]], <8 x bfloat> [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[TMP14]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP16]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3 + +define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP8]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + ret <8 x float> %0 +} + +define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP16]], <8 x i32> [[TMP12]] +; CHECK-NEXT: 
[[TMP17:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP17]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer + ret <8 x float> %2 +} +define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpbf16ps_256( +; CHECK-SAME: i8 zeroext [[U:%.*]], <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x float> [[E]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP17]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> [[E]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP18]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E + ret <8 x float> %2 +} + +declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3 + +define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; 
CHECK-LABEL: define <4 x float> @test_mm128_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP8]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + ret <4 x float> %0 +} + +define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_maskz_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], i4 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> 
zeroinitializer, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP16]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP17]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} +define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_mask_dpbf16ps_128( +; CHECK-SAME: i4 zeroext [[U:%.*]], <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x float> [[E]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP17]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> [[E]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP18]] +; +entry: + %0 = tail call <4 x float> 
@llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E + ret <4 x float> %2 +} + +define <16 x i16> @test_no_vbroadcast1() sanitize_memory { +; CHECK-LABEL: define <16 x i16> @test_no_vbroadcast1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = bitcast <8 x bfloat> %0 to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %2 +} + +define <16 x bfloat> @test_no_vbroadcast2() nounwind sanitize_memory { +; CHECK-LABEL: define <16 x bfloat> @test_no_vbroadcast2( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x bfloat> [[TMP1]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %1 +} + +define <16 x i32> @pr83358() sanitize_memory { +; CHECK-LABEL: define <16 x i32> @pr83358( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) + %2 = bitcast <8 x bfloat> %1 to <4 x i32> + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, 
i32 0, i32 1, i32 2, i32 3> + ret <16 x i32> %3 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Linker/drop-attribute.ll b/llvm/test/Linker/drop-attribute.ll index 9be95a89109b4..3d4c13c2ffc75 100644 --- a/llvm/test/Linker/drop-attribute.ll +++ b/llvm/test/Linker/drop-attribute.ll @@ -39,7 +39,7 @@ define void @test_nocallback_definition() nocallback { declare void @test_nocallback_call_site() ; Test that checks that nocallback attribute on an intrinsic is NOT dropped. -; CHECK: ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn +; CHECK: ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn ; CHECK-NEXT: declare float @llvm.sqrt.f32(float) #0 declare float @llvm.sqrt.f32(float) nocallback diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index f5cb4b72959f9..2661ed5b04cc9 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -82,12 +82,18 @@ #CHECK: lxvprll 6, 2, 1 0x7c 0xc2 0x0c 0xda +#CHECK: lxvpb32x 2, 15, 16 +0x7c,0x4f,0x86,0xda + #CHECK: stxvprl 0, 1, 2 0x7c 0x01 0x15 0x9a #CHECK: stxvprll 6, 0, 1 0x7c 0xc0 0x0d 0xda +#CHECK: stxvpb32x 2, 15, 16 +0x7c,0x4f,0x87,0xda + #CHECK: dmxvi8gerx4 1, 2, 4 0xec,0x82,0x20,0x58 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index f0df8ce39021b..7fb8254ced0ac 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -76,12 +76,18 @@ #CHECK: lxvprll 6, 2, 1 0xda 0x0c 0xc2 0x7c +#CHECK: lxvpb32x 2, 15, 16 +0xda,0x86,0x4f,0x7c + #CHECK: stxvprl 0, 1, 2 0x9a 0x15 0x01 0x7c #CHECK: stxvprll 6, 0, 1 0xda 0x0d 0xc0 0x7c +#CHECK: stxvpb32x 2, 15, 16 +0xda,0x87,0x4f,0x7c + #CHECK: dmxvi8gerx4 1, 2, 4 0x58,0x20,0x82,0xec diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index bc0683e38887c..40059c440b128 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -105,6 +105,10 @@ # CHECK-LE: lxvprll 6, 2, 1 # encoding: [0xda,0x0c,0xc2,0x7c] lxvprll 6, 2, 1 + lxvpb32x 2, 15, 16 +#CHECK-BE: lxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x86,0xda] +#CHECK-LE: lxvpb32x 2, 15, 16 # encoding: [0xda,0x86,0x4f,0x7c] + # CHECK-BE: stxvprl 0, 1, 2 # encoding: [0x7c,0x01,0x15,0x9a] # CHECK-LE: stxvprl 0, 1, 2 # encoding: [0x9a,0x15,0x01,0x7c] stxvprl 0, 1, 2 @@ -113,6 +117,10 @@ # CHECK-LE: stxvprll 6, 0, 1 # encoding: [0xda,0x0d,0xc0,0x7c] stxvprll 6, 0, 1 + stxvpb32x 2, 15, 16 +#CHECK-BE: stxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x87,0xda] +#CHECK-LE: stxvpb32x 2, 15, 16 # encoding: [0xda,0x87,0x4f,0x7c] + dmxvi8gerx4 1, 2, 4 # CHECK-BE: dmxvi8gerx4 1, 2, 4 # encoding: [0xec,0x82,0x20,0x58] # CHECK-LE: dmxvi8gerx4 1, 2, 4 # encoding: [0x58,0x20,0x82,0xec] diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 65b96c8b8ef5d..62975a3cf8ac4 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -208,6 +208,7 @@ ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass +; CHECK-O23SZ-NEXT: Running pass: 
DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 3a0fffe426da1..012a1ab5802b5 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -133,6 +133,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 4623edcaf6656..e021ff3124b60 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -118,6 +118,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 590afd925e841..20f94bc2e0f6c 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -127,6 +127,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index dd6acd2c51ee7..b61edc805108d 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -165,6 +165,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index ee054527e20bd..acf8c053d0f82 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -167,6 +167,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running 
pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index fd95e94f3c8b9..6b3c5ca7605de 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -131,6 +131,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td index 475faf9254157..8648651f3d714 100644 --- a/llvm/test/TableGen/directive1.td +++ b/llvm/test/TableGen/directive1.td @@ -61,6 +61,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #include <utility> // CHECK-EMPTY: // CHECK-NEXT: namespace llvm { +// CHECK-EMPTY: // CHECK-NEXT: namespace tdl { // CHECK-EMPTY: // CHECK-NEXT: LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); @@ -176,6 +177,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { // CHECK-NEXT: static constexpr bool is_iterable = true; // CHECK-NEXT: }; +// CHECK-EMPTY: // CHECK-NEXT: } // namespace llvm // CHECK-EMPTY: // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td index ccc09446b4465..96022d7647440 100644 --- a/llvm/test/TableGen/directive2.td +++ b/llvm/test/TableGen/directive2.td @@ -54,6 +54,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #include <utility> // CHECK-EMPTY: // CHECK-NEXT: namespace llvm { +// CHECK-EMPTY: // CHECK-NEXT: namespace tdl { // CHECK-EMPTY: // CHECK-NEXT: enum class Association { @@ -132,6 +133,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: LLVM_ABI Association getDirectiveAssociation(Directive D); // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D); // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D); +// CHECK-EMPTY: // CHECK-NEXT: } // namespace tdl // CHECK-EMPTY: // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> { @@ -149,6 +151,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { // CHECK-NEXT: static constexpr bool is_iterable = true; // CHECK-NEXT: }; +// CHECK-EMPTY: // CHECK-NEXT: } // namespace llvm // CHECK-EMPTY: // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll index 95a52aa0f7f52..b509b2469cfdc 100644 --- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll @@ -1,8 +1,8 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -passes=atomic-expand %s | FileCheck %s ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -mattr=+outline-atomics -passes=atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS -define void @atomic_swap_f16(ptr %ptr, half 
%val) nounwind { +define void @atomic_swap_f16(ptr %ptr, half %val) !prof !0 { ; CHECK-LABEL: @atomic_swap_f16( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VAL:%.*]] to i16 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -12,7 +12,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i16) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to half ; CHECK-NEXT: ret void @@ -27,7 +27,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ret void } -define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { +define void @atomic_swap_f32(ptr %ptr, float %val) nounwind !prof !0 { ; CHECK-LABEL: @atomic_swap_f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAL:%.*]] to i32 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -37,7 +37,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i32) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP3]] to float ; CHECK-NEXT: ret void @@ -52,7 +52,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { ret void } -define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { +define void @atomic_swap_f64(ptr %ptr, double %val) nounwind !prof !0 { ; CHECK-LABEL: @atomic_swap_f64( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[VAL:%.*]] to i64 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -60,7 +60,7 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i64) [[PTR:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP1]], ptr elementtype(i64) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP2]] to double ; CHECK-NEXT: ret void @@ -74,3 +74,17 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { %t1 = atomicrmw xchg ptr %ptr, double %val acquire ret void } + +!0 = !{!"function_entry_count", i64 1000} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nounwind willreturn } +;. +; OUTLINE-ATOMICS: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+outline-atomics" } +; OUTLINE-ATOMICS: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-features"="+outline-atomics" } +;. +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"} +;. +; OUTLINE-ATOMICS: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} +;. 
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll index 649e9467c0318..fffe50fde1e50 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll @@ -9,15 +9,25 @@ target triple = "x86_64-unknown-linux-gnu" ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0:![0-9]+]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -66,15 +76,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -123,15 +143,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x 
i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -180,15 +210,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -237,13 +277,21 @@ bb: ; This should not promote define internal fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: 
readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -290,13 +338,21 @@ bb: ; This should not promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %arg, ptr readonly %arg1) #2 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -343,15 +399,25 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg, ptr readonly %arg1) #3 { ; -; 
CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -400,15 +466,25 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg, ptr readonly %arg1) #4 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] 
= load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -464,6 +540,14 @@ attributes #3 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2 attributes #4 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2" "min-legal-vector-width"="256" "prefer-vector-width"="256" } attributes #5 = { argmemonly nounwind } ;. +; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } +; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } +; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +;. ; TUNIT: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } @@ -472,11 +556,7 @@ attributes #5 = { argmemonly nounwind } ; TUNIT: attributes #[[ATTR5]] = { nofree willreturn memory(write) } ; TUNIT: attributes #[[ATTR6]] = { nofree nosync nounwind willreturn } ;. 
-; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } -; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } -; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +; CGSCC: [[META0]] = !{} ;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/align-ptrmask.ll b/llvm/test/Transforms/Attributor/align-ptrmask.ll new file mode 100644 index 0000000000000..008f5e1b8a46e --- /dev/null +++ b/llvm/test/Transforms/Attributor/align-ptrmask.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=attributor -S < %s | FileCheck %s + +define ptr @align_ptrmask_back_no_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_no_prop( +; CHECK-SAME: ptr nofree writeonly align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 16 dereferenceable(4) ptr @align_ptrmask_back_prop( +; CHECK-SAME: ptr nofree writeonly align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 16 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 16 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 16 + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr 
@align_ptrmask_forward_mask( +; CHECK-SAME: ptr nofree readnone align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_ptr(ptr align 16 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_nonconst_mask(ptr align 8 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_nonconst_mask( +; CHECK-SAME: ptr nofree readnone align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_back_nonconst_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_nonconst_mask( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_noprop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_noprop( +; CHECK-SAME: ptr nofree writeonly align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + store float 1.0, ptr %p, align 8 + ret ptr %p 
+} + +define ptr @align_ptrmask_back_const_back_prop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_prop( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -2) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -2) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_back_const_forward_mask( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_ptr(ptr align 16 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_const_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +; FIXME: The store will create AAAlign for %ptr1, +; but the attribute didn't propagate through extractelement, need propagate +define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> align 2 %ptr, i64 %a) { +; CHECK-LABEL: define <2 x ptr> @ptrmask_v2p0_v2i64( +; CHECK-SAME: <2 x ptr> align 2 [[PTR:%.*]], i64 [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[RESULT:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[PTR]], <2 x i64> noundef splat (i64 -8)) #[[ATTR4]] +; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 0 +; CHECK-NEXT: [[PTR2:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 1 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR1]], align 16 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR2]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[RESULT]] +; + %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> splat(i64 -8)) + %ptr1 = extractelement <2 x ptr> %result, i32 0 + %ptr2 = extractelement <2 x ptr> %result, i32 1 + store i64 %a, ptr %ptr1, align 16 + store i64 %a, ptr %ptr2, align 16 + ret <2 x ptr> %result +} + +define ptr @align_ptrmask_forward_mask_positive(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_positive( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef 2) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 2) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_poison(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_poison( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail 
call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 poison) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 poison) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4294967296) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4294967296) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max_plus_one(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max_plus_one( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8589934592) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8589934592) + ret ptr %p +} + +define ptr @align_ptrmask_back_callsite(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_callsite( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4) + ret ptr %p +} diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll index 2a9d5d91ae053..94aa79aa327f4 100644 --- a/llvm/test/Transforms/Attributor/nofree.ll +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -238,7 +238,7 @@ define void @call_both() #0 { ; TEST 10 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -489,7 +489,7 @@ attributes #2 = { nobuiltin nounwind } ; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR5:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } -; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR7]] = { nofree nounwind } ; TUNIT: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; TUNIT: attributes #[[ATTR9]] = { nosync memory(none) } @@ -506,7 +506,7 @@ attributes #2 = { nobuiltin nounwind } ; CGSCC: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } ; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree noinline nosync nounwind 
willreturn memory(none) uwtable } -; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR7]] = { nofree nounwind } ; CGSCC: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; CGSCC: attributes #[[ATTR9]] = { nosync memory(none) } diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll index 7ef46e8e94c9e..c15bd775ddb4d 100644 --- a/llvm/test/Transforms/Attributor/nosync.ll +++ b/llvm/test/Transforms/Attributor/nosync.ll @@ -454,7 +454,7 @@ define void @nosync_convergent_callee_test() { ; CHECK: attributes #[[ATTR14:[0-9]+]] = { convergent memory(none) } ; CHECK: attributes #[[ATTR15]] = { memory(none) } ; CHECK: attributes #[[ATTR16]] = { nounwind } -; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR18]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR19]] = { nosync memory(none) } ; CHECK: attributes #[[ATTR20]] = { nofree nounwind } diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll index b7ac7fc2970b0..d65480b05759a 100644 --- a/llvm/test/Transforms/Attributor/willreturn.ll +++ b/llvm/test/Transforms/Attributor/willreturn.ll @@ -276,7 +276,7 @@ define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_a ; TEST 6 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -425,7 +425,7 @@ define i32 @loop_constant_trip_count(ptr nocapture readonly %0) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP1:%.*]] ], [ [[TMP9:%.*]], [[TMP3]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP8]], [[TMP3]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP7]], [[TMP5]] ; CHECK-NEXT: [[TMP9]] = add nuw nsw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 10 @@ -472,7 +472,7 @@ define i32 @loop_trip_count_unbound(i32 %0, i32 %1, ptr nocapture readonly %2, i ; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP14]], [[TMP8]] ], [ 0, [[TMP4]] ] ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP14]] = add nsw i32 [[TMP13]], [[TMP10]] ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]] @@ -522,7 +522,7 @@ define i32 @loop_trip_dec(i32 %0, ptr nocapture readonly %1) local_unnamed_addr ; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 
[[TMP5]], [[TMP4]] ], [ [[TMP12:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP11:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP11]] = add nsw i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[TMP12]] = add nsw i64 [[TMP7]], -1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP7]], 0 @@ -1294,7 +1294,7 @@ attributes #1 = { uwtable noinline } ; TUNIT: attributes #[[ATTR5]] = { noreturn } ; TUNIT: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; TUNIT: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; TUNIT: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; TUNIT: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1332,7 +1332,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR5]] = { noreturn } ; CGSCC: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; CGSCC: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; CGSCC: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; CGSCC: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1364,3 +1364,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR37]] = { nosync willreturn memory(read) } ; CGSCC: attributes #[[ATTR38]] = { willreturn memory(read) } ;. +; TUNIT: [[META0]] = !{} +;. +; CGSCC: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll index cf871e5714bf5..1dbffd962a638 100644 --- a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll +++ b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll @@ -262,7 +262,7 @@ define i32 @commutative_intrinsic_intersection_failure(i32 %arg, i32 %arg1) { } ;. 
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { memory(none) } ; CHECK: attributes #[[ATTR2]] = { memory(read) } ; CHECK: attributes #[[ATTR3]] = { alwaysinline memory(none) } diff --git a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll index 9e049837b0352..f0d4ad74fbe05 100644 --- a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll +++ b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll @@ -653,12 +653,11 @@ define i8 @mul_const_incoming0_speculatable(i1 %b, i8 %x, i8 %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[B:%.*]], label [[IF:%.*]], label [[THEN:%.*]] ; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = mul i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br label [[THEN]] ; CHECK: then: -; CHECK-NEXT: [[P0:%.*]] = phi i8 [ 42, [[ENTRY:%.*]] ], [ [[X:%.*]], [[IF]] ] -; CHECK-NEXT: [[P1:%.*]] = phi i8 [ 17, [[ENTRY]] ], [ [[Y:%.*]], [[IF]] ] +; CHECK-NEXT: [[R:%.*]] = phi i8 [ -54, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF]] ] ; CHECK-NEXT: call void @sideeffect() -; CHECK-NEXT: [[R:%.*]] = mul i8 [[P0]], [[P1]] ; CHECK-NEXT: ret i8 [[R]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/binop-select.ll b/llvm/test/Transforms/InstCombine/binop-select.ll index 25f624ee13412..fe1ec9014f188 100644 --- a/llvm/test/Transforms/InstCombine/binop-select.ll +++ b/llvm/test/Transforms/InstCombine/binop-select.ll @@ -335,7 +335,7 @@ define i32 @sub_sel_op1_use(i1 %b) { define float @fadd_sel_op0(i1 %b, float %x) { ; CHECK-LABEL: @fadd_sel_op0( -; CHECK-NEXT: [[R:%.*]] = select nnan i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 ; CHECK-NEXT: ret float [[R]] ; %s = select i1 %b, float 0xFFF0000000000000, float 0x7FF0000000000000 @@ -403,3 +403,188 @@ define i32 @ashr_sel_op1_use(i1 %b) { %r = ashr i32 -2, %s ret i32 %r } + +define i8 @commonArgWithOr0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr0( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 0, i8 8 +; CHECK-NEXT: [[V2:%.*]] = or disjoint i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr1( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 23 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr2( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 5, i8 42 +; CHECK-NEXT: [[V2:%.*]] = or i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd0( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = 
and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd1( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 8, i8 1 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd2( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd3( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor0( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 0, i8 8 +; CHECK-NEXT: [[V2:%.*]] = or disjoint i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor1( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 9, i8 1 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: ret i8 [[V2]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 9, i8 1 + %v2 = xor i8 %v1, %v0 + ret i8 %v2 +} + +define i8 @commonArgWithXor2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor2( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 1, i8 7 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor3( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 5, i8 45 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAdd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAdd0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 22, i8 61 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = add i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i32 @OrSelectIcmpZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +define i32 @OrSelectIcmpNonZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpNonZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; 
CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 42 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 42 + %or = or i32 %sel, %a + ret i32 %or +} diff --git a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll index 45e47d8e781be..5e90d4b8d4419 100644 --- a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll +++ b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll @@ -7,7 +7,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -16,8 +16,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; @@ -43,7 +42,7 @@ bb_exit: define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-LABEL: @foo_logical( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -52,8 +51,7 @@ define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index cd4a8e36c6e23..3cbf7090a13b8 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -1222,7 +1222,7 @@ define <2 x double> @negate_if_true_wrong_constant(<2 x double> %px, i1 %cond) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define float @fmul_select(float %x, i1 %c) { ; CHECK-LABEL: @fmul_select( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 ; CHECK-NEXT: ret float [[MUL]] ; %sel = select i1 %c, float 1.0, float 0.0 @@ -1233,7 +1233,7 @@ define float @fmul_select(float %x, i1 %c) { ; X *fast (C ? 1.0 : 0.0) -> C ? 
X : 0.0 define <2 x float> @fmul_select_vec(<2 x float> %x, i1 %c) { ; CHECK-LABEL: @fmul_select_vec( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sel = select i1 %c, <2 x float> <float 1.0, float 1.0>, <2 x float> zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll index 4b69a5e77b4ce..2e8e75c3ab3ef 100644 --- a/llvm/test/Transforms/InstCombine/free-inversion.ll +++ b/llvm/test/Transforms/InstCombine/free-inversion.ll @@ -563,10 +563,10 @@ define i1 @test_inv_free(i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK: b2: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: b3: +; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[C3:%.*]], [[C4:%.*]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[VAL_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ true, [[B2]] ], [ [[C3:%.*]], [[B3]] ] -; CHECK-NEXT: [[COND_NOT:%.*]] = and i1 [[VAL_NOT]], [[C4:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ [[C4]], [[B2]] ], [ [[TMP0]], [[B3]] ] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[B5:%.*]], label [[B4:%.*]] ; CHECK: b4: ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll new file mode 100644 index 0000000000000..a3b21ccc63e94 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll @@ -0,0 +1,169 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Basic functional test +define i32 @basic(i32 %a, i32 %b) { +; CHECK-LABEL: @basic( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Operand order swap test +define i32 @swap_operand_order(i32 %x, i32 %y) { +; CHECK-LABEL: @swap_operand_order( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Negative test: Non-zero false value in select +define i32 @negative_non_zero_false_val(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_non_zero_false_val( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 1 + %or = or i32 %sel, %a + ret i32 %or +} + +; Negative test: Incorrect comparison predicate (NE) +define i32 @negative_wrong_predicate(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_wrong_predicate( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 0, i32 [[TMP1:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[OR]], [[A]] +; CHECK-NEXT: ret i32 [[OR1]] +; + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Comparison direction swap test (0 == X) +define i32 @cmp_swapped(i32 %x, i32 %y) { +; 
CHECK-LABEL: @cmp_swapped( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 0, %x + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Complex expression test +define i32 @complex_expression(i32 %a, i32 %b) { +; CHECK-LABEL: @complex_expression( +; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[X]] +; CHECK-NEXT: ret i32 [[RES]] +; + %x = add i32 %a, 1 + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %x + ret i32 %or +} + +; zext test +define i32 @zext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @zext_cond( +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[Z]] +; CHECK-NEXT: ret i32 [[OR]] +; + %z = zext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %z + ret i32 %or +} + +; sext test +define i32 @sext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @sext_cond( +; CHECK-NEXT: [[S:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[S]] +; CHECK-NEXT: ret i32 [[OR]] +; + %s = sext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %s + ret i32 %or +} + +; Vector type test +define <2 x i32> @vector_type(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @vector_type( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[B:%.*]], <2 x i32> [[A]] +; CHECK-NEXT: ret <2 x i32> [[RES]] +; + %cmp = icmp eq <2 x i32> %a, zeroinitializer + %sel = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> zeroinitializer + %or = or <2 x i32> %sel, %a + ret <2 x i32> %or +} + +; Pointer type test (should not trigger optimization) +define ptr @pointer_type(ptr %p, ptr %q) { +; CHECK-LABEL: @pointer_type( +; CHECK-NEXT: [[A:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], ptr [[Q:%.*]], ptr null +; CHECK-NEXT: [[SEL_INT:%.*]] = ptrtoint ptr [[SEL]] to i64 +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[SEL_INT]] +; CHECK-NEXT: [[RET:%.*]] = inttoptr i64 [[OR]] to ptr +; CHECK-NEXT: ret ptr [[RET]] +; + %a = ptrtoint ptr %p to i64 + %cmp = icmp eq i64 %a, 0 + %sel = select i1 %cmp, ptr %q, ptr null + %sel_int = ptrtoint ptr %sel to i64 + %or_val = or i64 %a, %sel_int + %ret = inttoptr i64 %or_val to ptr + ret ptr %ret +} + +; Multi-use test (should not trigger optimization) +define i32 @multi_use_test(i32 %x, i32 %m) { +; CHECK-LABEL: @multi_use_test( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[M:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[X]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[X]] +; CHECK-NEXT: [[O2:%.*]] = sub i32 [[OR]], [[ADD]] +; CHECK-NEXT: ret i32 [[O2]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %m, i32 0 + %or = or i32 %sel, %x + %add = add i32 %sel, %x + %res = sub i32 %or, %add + ret i32 
%res +} diff --git a/llvm/test/Transforms/InstCombine/recurrence.ll b/llvm/test/Transforms/InstCombine/recurrence.ll index f75e0d439c572..643e7efc243a3 100644 --- a/llvm/test/Transforms/InstCombine/recurrence.ll +++ b/llvm/test/Transforms/InstCombine/recurrence.ll @@ -24,9 +24,9 @@ loop: ; preds = %loop, %entry define i64 @test_or2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_or2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; @@ -104,9 +104,9 @@ loop: ; preds = %loop, %entry define i64 @test_and2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_and2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll index 3d97048f43127..8b3c0502ac04d 100644 --- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll +++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll @@ -256,27 +256,27 @@ define <2 x i1> @not_logical_or2(i1 %b, <2 x i32> %a) { ret <2 x i1> %and } -define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %not, i1 %a, i1 false - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %not, i1 %a, i1 false, !prof!1 + %and2 = select i1 %c, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %c, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -292,15 +292,15 @@ define i1 @bools_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a %and2 = and i1 %c, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -457,27 +457,27 @@ define i1 @bools_logical_commute3_and1_and2(i1 %b, i1 %c) { ret i1 %or } -define i1 
@bools2_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %c, i1 %a, i1 false - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %c, i1 %a, i1 false, !prof !1 + %and2 = select i1 %not, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %not, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -493,15 +493,15 @@ define i1 @bools2_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a %and2 = and i1 %not, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -799,8 +799,11 @@ define <2 x i1> @not_logical_and2(i1 %b, <2 x i32> %a) { !0 = !{!"function_entry_count", i64 1000} !1 = !{!"branch_weights", i32 2, i32 3} +!2 = !{!"branch_weights", i32 5, i32 7} +!3 = !{!"branch_weights", i32 11, i32 13} ;. ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3} ; CHECK: [[PROF2]] = !{!"branch_weights", i32 3, i32 2} +; CHECK: [[PROF3]] = !{!"unknown", !"instcombine"} ;. 
diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index ee70137e8fbd7..01da63fa5b0af 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -858,8 +858,7 @@ define i1 @_gep_phi2(ptr %str1, i64 %val2) { ; CHECK: while.end.i: ; CHECK-NEXT: br label [[_Z3FOOPKC_EXIT]] ; CHECK: _Z3fooPKc.exit: -; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ 0, [[LOR_LHS_FALSE_I]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[RETVAL_0_I]], [[VAL2:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ [[VAL2:%.*]], [[LOR_LHS_FALSE_I]] ], [ [[VAL2]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TOBOOL]] ; diff --git a/llvm/test/Transforms/LoopFusion/pr164082.ll b/llvm/test/Transforms/LoopFusion/pr164082.ll new file mode 100644 index 0000000000000..652557cef48f8 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/pr164082.ll @@ -0,0 +1,65 @@ +; REQUIRES: asserts +; RUN: opt -passes=loop-fusion -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s +; STAT: 1 loop-fusion - Loops fused + +; C Code +; +;; for (int i = 0; i < 100; ++i) +;; Array[i][i] = -i; +;; for (int row = 0; row < 100; ++row) +;; for (int col = 0; col < 100; ++col) +;; if (col != row) +;; Array[row][col] = row + col; +; +; Loop fusion should no longer crash, now that forgetBlockAndLoopDispositions() +; is triggered after mergeLatch() during the fusion. + +define i32 @forget_dispositions() nounwind { +entry: + %Array = alloca [100 x [100 x i32]], align 4 + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ] + %0 = trunc i64 %indvars.iv33 to i32 + %sub = sub i32 0, %0 + %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33 + store i32 %sub, ptr %arrayidx2, align 4 + %indvars.iv.next34 = add i64 %indvars.iv33, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32 + %exitcond36 = icmp eq i32 %lftr.wideiv35, 100 + br i1 %exitcond36, label %for.cond6.preheader, label %for.body + +for.cond6.preheader: ; preds = %for.body, %for.inc17 + %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ] + br label %for.body8 + +for.body8: ; preds = %for.inc14, %for.cond6.preheader + %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ] + %1 = trunc i64 %indvars.iv to i32 + %2 = trunc i64 %indvars.iv29 to i32 + %cmp9 = icmp eq i32 %1, %2 + br i1 %cmp9, label %for.inc14, label %if.then + +if.then: ; preds = %for.body8 + %3 = add i64 %indvars.iv, %indvars.iv29 + %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv + %4 = trunc i64 %3 to i32 + store i32 %4, ptr %arrayidx13, align 4 + br label %for.inc14 + +for.inc14: ; preds = %for.body8, %if.then + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32 + %exitcond28 = icmp eq i32 %lftr.wideiv27, 100 + br i1 %exitcond28, label %for.inc17, label %for.body8 + +for.inc17: ; preds = %for.inc14 + %indvars.iv.next30 = add i64 %indvars.iv29, 1 + %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32 + %exitcond32 = icmp eq i32 %lftr.wideiv31, 100 + br i1 %exitcond32, label %for.exit, label %for.cond6.preheader + +for.exit: ; preds = %for.inc17 + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll 
b/llvm/test/Transforms/LoopIdiom/basic.ll index e8ea912246728..9deccf5352ea8 100644 --- a/llvm/test/Transforms/LoopIdiom/basic.ll +++ b/llvm/test/Transforms/LoopIdiom/basic.ll @@ -1620,5 +1620,5 @@ define noalias ptr @_ZN8CMSPULog9beginImplEja(ptr nocapture writeonly %0) local_ ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll index 7f266a754d1bc..314cf38baae04 100644 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll @@ -85,6 +85,35 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !3 } +; LOOP-UNROLL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-NEXT: Loop Size = 4 +; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1 +; LOOP-UNROLL-NEXT: Trying runtime unrolling on Loop: +; LOOP-UNROLL-NEXT: Loop at depth 1 containing: %for.body<header><exiting>,%for.cond<latch> +; LOOP-UNROLL-NEXT: Using epilog remainder. +; LOOP-UNROLL-NEXT: Loop latch not terminated by a conditional branch. +; LOOP-UNROLL-NEXT: UNROLLING loop %for.body by 5! + +; LOOP-UNROLL-FULL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-FULL-NEXT: Loop Size = 4 +; LOOP-UNROLL-FULL-NEXT: Not attempting partial/runtime unroll in FullLoopUnroll +define void @pragma_unroll_count2(i64 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.cond, %entry + %i = phi i64 [ 0, %entry ], [ %inc, %for.cond ] + %cmp = icmp ult i64 %i, %n + br i1 %cmp, label %for.cond, label %for.cond.cleanup + +for.cond: ; preds = %for.body + %inc = add i64 %i, 8 + br label %for.body, !llvm.loop !3 + +for.cond.cleanup: ; preds = %for.body + ret void +} + ; LOOP-UNROLL: llvm.loop.unroll.disable ; LOOP-UNROLL-FULL: llvm.loop.unroll.enable !0 = !{!"llvm.loop.unroll.enable"} diff --git a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll new file mode 100644 index 0000000000000..14f6da42df6b1 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll @@ -0,0 +1,116 @@ +; Check that a loop probability of one (indicating an always infinite loop) does +; not crash or otherwise break LoopUnroll behavior when it tries to compute new +; probabilities from it. +; +; That case indicates an always infinite loop. A remainder loop cannot be +; calculated at run time when the original loop is infinite as infinity % +; UnrollCount is undefined, so consistent remainder loop probabilities are +; difficult or impossible to reason about. The implementation chooses +; probabilities indicating that all remainder loop iterations will always +; execute. 
+ +; DEFINE: %{unroll} = opt < %s -unroll-count=3 -passes=loop-unroll -S +; DEFINE: %{rt} = %{unroll} -unroll-runtime + +; RUN: %{unroll} | FileCheck %s -check-prefix UNROLL +; RUN: %{rt} -unroll-runtime-epilog=true | FileCheck %s -check-prefix EPILOG +; RUN: %{rt} -unroll-runtime-epilog=false | FileCheck %s -check-prefix PROLOG + +define void @test(i32 %n) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %i, 1 + %c = icmp slt i32 %inc, %n + br i1 %c, label %loop, label %end, !prof !0 + +end: + ret void +} + + +!0 = !{!"branch_weights", i32 1, i32 0} + +; UNROLL: define void @test(i32 %n) { +; UNROLL: entry: +; UNROLL: br label %loop +; UNROLL: loop: +; UNROLL: br i1 %c, label %loop.1, label %end, !prof !0 +; UNROLL: loop.1: +; UNROLL: br i1 %c.1, label %loop.2, label %end, !prof !0 +; UNROLL: loop.2: +; UNROLL: br i1 %c.2, label %loop, label %end, !prof !0, !llvm.loop !1 +; UNROLL-NOT: loop.3 +; UNROLL: end: +; UNROLL: ret void +; UNROLL: } +; +; Infinite unrolled loop. +; UNROLL: !0 = !{!"branch_weights", i32 1, i32 0} + +; EPILOG: define void @test(i32 %n) { +; EPILOG: entry: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0 +; EPILOG: entry.new: +; EPILOG: br label %loop +; EPILOG: loop: +; EPILOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1 +; EPILOG: end.unr-lcssa: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1 +; EPILOG: loop.epil.preheader: +; EPILOG: br label %loop.epil +; EPILOG: loop.epil: +; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4 +; EPILOG: end.epilog-lcssa: +; EPILOG: br label %end +; EPILOG: end: +; EPILOG: ret void +; EPILOG: } +; +; Unrolled loop guard: Unrolled loop is always entered. +; EPILOG: !0 = !{!"branch_weights", i32 0, i32 -2147483648} +; +; Unrolled loop latch: Unrolled loop is infinite. +; Epilogue loop guard: Epilogue loop is always entered if unrolled loop exits. +; EPILOG: !1 = !{!"branch_weights", i32 -2147483648, i32 0} +; +; Epilogue loop latch: Epilogue loop executes both of its 2 iterations. +; EPILOG: !4 = !{!"branch_weights", i32 1073741824, i32 1073741824} + +; PROLOG: define void @test(i32 %n) { +; PROLOG: entry: +; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0 +; PROLOG: loop.prol.preheader: +; PROLOG: br label %loop.prol +; PROLOG: loop.prol: +; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1 +; PROLOG: loop.prol.loopexit.unr-lcssa: +; PROLOG: br label %loop.prol.loopexit +; PROLOG: loop.prol.loopexit: +; PROLOG: br i1 %{{.*}}, label %end, label %entry.new, !prof !0 +; PROLOG: entry.new: +; PROLOG: br label %loop +; PROLOG: loop: +; PROLOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4 +; PROLOG: end.unr-lcssa: +; PROLOG: br label %end +; PROLOG: end: +; PROLOG: ret void +; PROLOG: } +; +; FIXME: Branch weights still need to be fixed in the case of prologues (issue +; #135812), so !0 and !1 do not yet match their comments below. When we do +; fix it, this test will hopefully catch any bug like issue #165998, which +; impacted the case of epilogues. +; +; Prologue loop guard: Prologue loop is always entered. +; Unrolled loop guard: Unrolled loop is always entered. +; PROLOG: !0 = !{!"branch_weights", i32 1, i32 127} +; +; Prologue loop latch: Prologue loop executes both of its 2 iterations. +; PROLOG: !1 = !{!"branch_weights", i32 0, i32 1} +; +; Unrolled loop latch: Unrolled loop is infinite. 
+; PROLOG: !4 = !{!"branch_weights", i32 1, i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll new file mode 100644 index 0000000000000..4d378b0d22f7d --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll @@ -0,0 +1,30 @@ +; Check that zeroed branch weights do not crash or otherwise break basic +; LoopUnroll behavior when it tries to compute a probability from them. + +; RUN: opt < %s -S -unroll-count=2 -passes='loop-unroll' 2>&1 | FileCheck %s + +define void @test() { +entry: + br label %loop + +loop: + br i1 false, label %end, label %loop, !prof !0 + +end: + ret void +} + +!0 = !{!"branch_weights", i32 0, i32 0} + +; CHECK: define void @test() { +; CHECK: entry: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: br i1 false, label %end, label %loop.1, !prof !0 +; CHECK: loop.1: +; CHECK: br i1 false, label %end, label %loop, !prof !0, !llvm.loop !1 +; CHECK-NOT: loop.2 +; CHECK: end: +; CHECK: ret void +; CHECK: } +; CHECK: !0 = !{!"branch_weights", i32 0, i32 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 199203a9f5cb0..1164778c19070 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -82,7 +82,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 27 +; CHECK: Cost for VF 8: 16 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 26e630f969ef3..0ee6b52a2450b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -55,44 +55,36 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: entry: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: -; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 -; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 -; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] -; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] -; 
CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) -; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -166,44 +158,36 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: entry: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: -; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 -; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 -; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] -; 
CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] -; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) -; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -292,7 +276,7 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -387,7 +371,7 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index b84763142b686..e74830700776c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -512,17 +512,23 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP12]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 4636c1b63da82..d77ca9875bf01 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -758,132 +758,87 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1) -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP22]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP26]] -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 
8 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP9]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]] -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP35]] -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP11]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 
@llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]] -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1) -; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP22]] -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call 
i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP24]], 8 -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]] -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP8]] ; entry: br label %for.body @@ -930,7 +885,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -968,7 +923,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8 @@ -1000,7 +955,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -1085,7 +1040,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> 
[[PARTIAL_REDUCE13]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) @@ -1183,7 +1138,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]] ; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -1253,7 +1208,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]]) ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]]) @@ -1350,7 +1305,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] @@ -1388,7 +1343,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 
@llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] @@ -1426,7 +1381,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true -; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] @@ -1461,82 +1416,66 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP8]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 
[[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]] -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP15]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: 
[[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <16 x i32> [[TMP15]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]] -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: 
define i32 @not_dotp_extend_user( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1561,7 +1500,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1616,7 +1555,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] @@ -1651,7 +1590,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -1685,7 +1624,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] @@ -1803,7 +1742,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -1915,7 +1854,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -1953,7 +1892,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -1968,31 +1907,27 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; 
CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2048,7 +1983,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2086,7 +2021,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 
@llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2101,31 +2036,27 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], 
label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2186,7 +2117,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2232,7 +2163,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -2274,7 +2205,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2396,7 +2327,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2496,7 +2427,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2596,7 +2527,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 89819f2be4967..1cbec47d72203 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: ; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; RV32: vector.scevcheck: +; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624) +; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]] +; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]] +; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]] +; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624) +; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 +; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]] +; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]] +; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]] +; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]] +; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]] ; RV32: vector.memcheck: -; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880 ; RV32-NEXT: [[SCEVGEP1:%.*]] = 
getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940 -; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752 -; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880 +; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752 +; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] ; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: ; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; RV32-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16) @@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100) ; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]] ; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double> ; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]] -; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> 
[[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]] ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; RV32: middle.block: ; RV32-NEXT: br label [[FOR_END:%.*]] ; RV32: scalar.ph: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ] ; RV32-NEXT: br label [[FOR_BODY:%.*]] ; RV32: for.body: -; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 @@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; @@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: for.inc: ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV64: for.end: ; RV64-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll index e6b74062ad765..a33f8eb920039 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll @@ -35,7 +35,7 @@ target triple = "x86_64-unknown-linux-gnu" ; This test was originally vectorized, but now SCEV is smart enough to prove ; that its trip count is 1, so it gets ignored by vectorizer. 
; Function Attrs: uwtable -define void @test_01(i1 %arg) { +define void @test_01(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -57,8 +57,8 @@ define void @test_01(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 61 @@ -74,7 +74,7 @@ define void @test_01(i1 %arg) { ; CHECK: store <4 x i32> ; Function Attrs: uwtable -define void @test_02(i1 %arg) { +define void @test_02(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -96,8 +96,8 @@ define void @test_02(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 610 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll index 4cff8753ba9b1..239366c59470e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -11,9 +11,9 @@ target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-w64-windows-gnu" -define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { +define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p, ptr %pend) #0 { ; CHECK-LABEL: define void @cff_index_load_offsets( -; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]], ptr [[PEND:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[EXIT:.*]] ; CHECK: [[IF_THEN]]: @@ -26,14 +26,14 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { ; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[CHAR_TBAA1]] -; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA1]] +; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 12, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]] -; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[LONG_TBAA4:![0-9]+]] +; CHECK-NEXT: store i32 [[OR83]], ptr [[P]], align 4, !tbaa [[LONG_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4 -; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef +; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], [[PEND]] ; CHECK-NEXT: br i1 [[CMP66]], label %[[FOR_BODY68]], label %[[SW_EPILOG:.*]] ; CHECK: [[SW_EPILOG]]: ; CHECK-NEXT: unreachable @@ -54,14 +54,14 @@ for.body68: ; preds = %for.body68, %if.the %conv73 = zext i8 %0 to i32 %shl74 = shl nuw nsw i32 %conv73, 16 %or75 = or i32 %shl74, %shl71 - %1 = load i8, 
ptr undef, align 1, !tbaa !1 - %shl78 = shl nuw nsw i32 undef, 8 + %1 = load i8, ptr %p, align 1, !tbaa !1 + %shl78 = shl nuw nsw i32 12, 8 %or79 = or i32 %or75, %shl78 %conv81 = zext i8 %1 to i32 %or83 = or i32 %or79, %conv81 - store i32 %or83, ptr undef, align 4, !tbaa !4 + store i32 %or83, ptr %p, align 4, !tbaa !4 %add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4 - %cmp66 = icmp ult ptr %add.ptr86, undef + %cmp66 = icmp ult ptr %add.ptr86, %pend br i1 %cmp66, label %for.body68, label %sw.epilog sw.epilog: ; preds = %for.body68 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll index e75d469506376..acec9e47a94ee 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll @@ -41,8 +41,8 @@ for.cond.cleanup: ; preds = %for.body ; Make sure interleave groups with a key being the special 'empty' value for ; the map do not cause a crash. -define void @test_gap_empty_key() { -; CHECK-LABEL: @test_gap_empty_key() +define void @test_gap_empty_key(ptr %p) { +; CHECK-LABEL: @test_gap_empty_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -57,7 +57,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483647 store i32 0, ptr %G2 @@ -71,8 +71,8 @@ exit: ; Make sure interleave groups with a key being the special 'tombstone' value for ; the map do not cause a crash. 
-define void @test_tombstone_key() { -; CHECK-LABEL: @test_tombstone_key() +define void @test_tombstone_key(ptr %p) { +; CHECK-LABEL: @test_tombstone_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -87,7 +87,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483648 store i32 0, ptr %G2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index efcc0005acaa3..f9570405ecabc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -711,17 +711,92 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr align 1 [[TMP3]], <24 x i1> [[TMP5]], <24 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], splat (i32 3) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = 
extractelement <8 x i1> [[TMP4]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP53]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.load.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP4]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.load.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP4]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.load.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP4]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; ENABLED_MASKED_STRIDED: pred.load.continue8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP4]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label 
[[PRED_LOAD_CONTINUE10:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; ENABLED_MASKED_STRIDED: pred.load.continue10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP4]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; ENABLED_MASKED_STRIDED: pred.load.continue12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP4]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; ENABLED_MASKED_STRIDED: pred.load.continue14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP4]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.if15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.continue16: +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: 
[[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll index 41756ffb64e6c..8744e45344242 100644 --- a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll @@ -4,13 +4,13 @@ ; Only make sure we do not crash. ; CHECK: @test -define void @test(ptr %ptr, ptr %ptr_end) { +define void @test(i8 %v, ptr %ptr, ptr %ptr_end) { start: br label %loop loop: %ptr2 = phi ptr [ %ptr3, %loop ], [ %ptr, %start ] - %x = sext i8 undef to i64 + %x = sext i8 %v to i64 %ptr3 = getelementptr inbounds i8, ptr %ptr2, i64 1 %cmp = icmp ult ptr %ptr3, %ptr_end br i1 %cmp, label %loop, label %end diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index c164c4a46bd94..e7913c583b938 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -384,15 +384,15 @@ for.inc26: ; conditional store to remain scalar. Since we can only type-shrink vector ; types, we shouldn't try to represent the expression in a smaller type. ; -define void @minimal_bit_widths(i1 %c) { +define void @minimal_bit_widths(ptr %p, i1 %c) { ; UNROLL-LABEL: @minimal_bit_widths( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] ; UNROLL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] @@ -415,8 +415,8 @@ define void @minimal_bit_widths(i1 %c) { ; UNROLL-NOSIMPLIFY: vector.body: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -442,16 +442,16 @@ define void @minimal_bit_widths(i1 %c) { ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] -; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] +; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1 ; VEC-NEXT: br i1 [[C:%.*]], label 
[[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if: ; VEC-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP8]] +; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] ; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 ; VEC-NEXT: store i8 [[TMP4]], ptr [[TMP3]], align 1 ; VEC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr undef, i64 [[TMP5]] +; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]] ; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 ; VEC-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] @@ -468,7 +468,7 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1000, %entry ] - %tmp2 = getelementptr i8, ptr undef, i64 %tmp0 + %tmp2 = getelementptr i8, ptr %p, i64 %tmp0 %tmp3 = load i8, ptr %tmp2, align 1 br i1 %c, label %if.then, label %for.inc diff --git a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll index 106054d989776..d87d9b155db1c 100644 --- a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -define void @test(i1 %arg) { +define void @test(ptr %p, i1 %arg) { entry: br i1 %arg, label %while.end, label %while.body.lr.ph @@ -11,7 +11,7 @@ while.body.lr.ph: br label %while.body while.body: - %it.sroa.0.091 = phi ptr [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] + %it.sroa.0.091 = phi ptr [ %p, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] %incdec.ptr.i = getelementptr inbounds i32, ptr %it.sroa.0.091, i64 1 %inc32 = add i32 undef, 1 ; <------------- Make sure we don't set NSW flags to the undef. 
%cmp.i11 = icmp eq ptr %incdec.ptr.i, undef diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 763072ab16f73..f9f7feb7bdfbc 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -248,25 +248,27 @@ for.end: ; preds = %for.body ; @cm_array = external global [2592 x i16], align 1 -define void @pr43371() optsize { +define void @pr43371(i16 %val) optsize { ; ; CHECK-LABEL: define void @pr43371( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 +; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -277,22 +279,24 @@ define void @pr43371() optsize { ; CHECK-NEXT: unreachable ; ; PGSO-LABEL: define void @pr43371( -; PGSO-SAME: ) #[[ATTR0]] { +; PGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; PGSO: [[VECTOR_BODY]]: ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 
[[TMP2]] +; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -303,22 +307,24 @@ define void @pr43371() optsize { ; PGSO-NEXT: unreachable ; ; NPGSO-LABEL: define void @pr43371( -; NPGSO-SAME: ) #[[ATTR0]] { +; NPGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: +; NPGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; NPGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; NPGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; NPGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; NPGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; NPGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -340,7 +346,7 @@ for.cond.cleanup28: for.body29: %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 + %add33 = add i16 %val, %i24.0170 %idxprom34 = zext i16 %add33 to i32 %arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34 store i16 0, ptr %arrayidx35, align 1 @@ -349,25 +355,27 @@ for.body29: br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 } -define void @pr43371_pgso() !prof !14 { +define void @pr43371_pgso(i16 %val) !prof !14 { ; ; CHECK-LABEL: define void @pr43371_pgso( -; CHECK-SAME: ) !prof [[PROF14]] { +; CHECK-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> 
zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 +; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -378,22 +386,24 @@ define void @pr43371_pgso() !prof !14 { ; CHECK-NEXT: unreachable ; ; PGSO-LABEL: define void @pr43371_pgso( -; PGSO-SAME: ) !prof [[PROF14]] { +; PGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; PGSO: [[VECTOR_BODY]]: ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -404,17 +414,19 @@ define void @pr43371_pgso() !prof !14 { ; PGSO-NEXT: unreachable ; ; NPGSO-LABEL: define void @pr43371_pgso( -; NPGSO-SAME: ) !prof [[PROF14]] { +; NPGSO-SAME: i16 [[VAL:%.*]]) !prof 
[[PROF14]] { ; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] ; NPGSO: [[VECTOR_SCEVCHECK]]: -; NPGSO-NEXT: br i1 undef, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NPGSO-NEXT: [[TMP0:%.*]] = add i16 [[VAL]], 755 +; NPGSO-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP0]], [[VAL]] +; NPGSO-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; NPGSO-NEXT: [[TMP1:%.*]] = add i16 undef, [[OFFSET_IDX]] +; NPGSO-NEXT: [[TMP1:%.*]] = add i16 [[VAL]], [[OFFSET_IDX]] ; NPGSO-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32 ; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; NPGSO-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP3]], align 1 @@ -429,7 +441,7 @@ define void @pr43371_pgso() !prof !14 { ; NPGSO-NEXT: unreachable ; NPGSO: [[FOR_BODY29]]: ; NPGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; NPGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] +; NPGSO-NEXT: [[ADD33:%.*]] = add i16 [[VAL]], [[I24_0170]] ; NPGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; NPGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] ; NPGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 @@ -449,7 +461,7 @@ for.cond.cleanup28: for.body29: %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 + %add33 = add i16 %val, %i24.0170 %idxprom34 = zext i16 %add33 to i32 %arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34 store i16 0, ptr %arrayidx35, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll index 2d30e0c9ad10f..f65e9cab1700b 100644 --- a/llvm/test/Transforms/LoopVectorize/pr32859.ll +++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll @@ -10,13 +10,13 @@ ; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ] ; Function Attrs: nounwind uwtable -define void @main(i32 %n) #0 { +define void @main(i32 %n, i32 %v) #0 { entry: br label %for.cond1.preheader.i for.cond1.preheader.i: ; preds = %if.end.2.i, %entry %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ] - %tobool.i = icmp ne i32 undef, 0 + %tobool.i = icmp ne i32 %v, 0 br label %if.end.2.i if.end.2.i: ; preds = %for.cond1.preheader.i diff --git a/llvm/test/Transforms/LoopVectorize/pr36311.ll b/llvm/test/Transforms/LoopVectorize/pr36311.ll index f2dfecc341e6f..bc27e4e9b09cd 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36311.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36311.ll @@ -10,10 +10,7 @@ $test = comdat any -declare i32 @__gxx_personality_v0(...) 
- -; Function Attrs: uwtable -define dso_local void @test(i1 %arg) local_unnamed_addr #0 comdat align 2 personality ptr @__gxx_personality_v0 { +define void @test(ptr %p, i1 %arg) { entry: br label %for.body51 @@ -26,9 +23,9 @@ for.cond80.loopexit: ; preds = %for.body89 for.body89.lr.ph: ; preds = %for.cond80.loopexit, %for.body51 %i79.0179 = phi i32 [ %add90, %for.cond80.loopexit ], [ 0, %for.body51 ] - %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ undef, %for.body51 ] + %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ 0, %for.body51 ] %add90 = add nuw i32 %i79.0179, 1 - %mul91 = mul i32 %add90, undef + %mul91 = mul i32 %add90, 7 br label %for.body89 for.body89: ; preds = %for.body89, %for.body89.lr.ph @@ -38,10 +35,10 @@ for.body89: ; preds = %for.body89, %for.bo %add93 = add i32 %add92, %mul91 %inc94 = add i32 %next_index.5174, 1 %conv95 = zext i32 %next_index.5174 to i64 - %arrayidx.i160 = getelementptr inbounds i32, ptr undef, i64 %conv95 + %arrayidx.i160 = getelementptr inbounds i32, ptr %p, i64 %conv95 store i32 %add93, ptr %arrayidx.i160, align 4 ;, !tbaa !1 - %cmp87 = icmp ult i32 %add92, undef + %cmp87 = icmp ult i32 %add92, 123 br i1 %cmp87, label %for.body89, label %for.cond80.loopexit nrvo.skipdtor.loopexit: ; preds = %for.cond80.loopexit diff --git a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll index 0656cd2b2aa94..0fdc8fd6ad519 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll @@ -15,7 +15,7 @@ define void @PR49215(ptr %p, ptr %q) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[Q:%.*]], [[G]] ; CHECK-NEXT: [[UMIN]] = select i1 [[CMP2]], ptr [[Q]], ptr [[G]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], undef +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 123 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: loopexit: ; CHECK-NEXT: [[UMIN_LCSSA:%.*]] = phi ptr [ [[UMIN]], [[FOR_BODY]] ] @@ -31,7 +31,7 @@ for.body: %cmp2 = icmp ult ptr %q, %g %umin = select i1 %cmp2, ptr %q, ptr %g %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, undef + %exitcond = icmp eq i64 %iv.next, 123 br i1 %exitcond, label %loopexit, label %for.body loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index 13cc1b657d231..f01e562fe40c7 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -3,7 +3,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define i8 @PR34687(i1 %c, i32 %x, i32 %n) { +define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK-LABEL: @PR34687( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4 @@ -13,20 +13,30 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> 
poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[X1:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP0]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255) -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8> ; CHECK-NEXT: [[TMP4]] = zext <4 x i8> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[PREDPHI:%.*]] = extractelement <4 x i32> [[PREDPHI1]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -36,17 +46,19 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[R_NEXT:%.*]], [[IF_END]] ] -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: [[T0:%.*]] = sdiv i32 undef, undef +; CHECK-NEXT: [[T0:%.*]] = sdiv i32 [[I]], [[X]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: +; CHECK-NEXT: [[DIV_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[T0]], [[IF_THEN]] ] ; CHECK-NEXT: [[T1:%.*]] = and i32 [[R]], 255 ; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 -; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X]] +; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X1]] ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: +; CHECK-NEXT: [[DIV_USE:%.*]] = phi i32 [ [[DIV_PHI]], [[IF_END]] ], [ [[PREDPHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[R_NEXT]], [[IF_END]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i8 ; CHECK-NEXT: ret i8 [[T3]] @@ -60,10 +72,11 @@ for.body: br i1 %c, label %if.then, label %if.end if.then: - %t0 = sdiv i32 undef, undef + %t0 = sdiv i32 %i, %divisor br label 
%if.end if.end: + %div_phi = phi i32 [ 0, %for.body ], [ %t0, %if.then ] %t1 = and i32 %r, 255 %i.next = add nsw i32 %i, 1 %r.next = add nuw nsw i32 %t1, %x @@ -71,6 +84,7 @@ if.end: br i1 %cond, label %for.end, label %for.body for.end: + %div_use = phi i32 [ %div_phi, %if.end ] %t2 = phi i32 [ %r.next, %if.end ] %t3 = trunc i32 %t2 to i8 ret i8 %t3 diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll index c76c2c0ef47a2..ab10d62bc6048 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -12,12 +12,12 @@ entry: loop: %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] - %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr, i64 undef + %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr store i64 0, ptr %tmp4, align 8 %tmp8 = add i64 1, %tmp3 %tmp10 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp8 store i64 1, ptr %tmp10, align 8 - %tmp14 = add i64 undef, %tmp3 + %tmp14 = add i64 3, %tmp3 %tmp16 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp14 store i64 2, ptr %tmp16, align 8 %tmp18 = add nuw nsw i64 %tmp3, 4 diff --git a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll index 1fccf546f4a67..d3cd80beaae90 100644 --- a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll @@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: @t( ; CHECK: <4 x i32> -define void @t() { +define void @t(ptr %p) { entry: br label %for.body @@ -22,13 +22,13 @@ for.body: %indvars.iv17 = phi i64 [ %indvars.next, %for.body ], [ 128, %entry ] ; Loop invariant anchored in loop. - %idxprom21 = zext i32 undef to i64 + %idxprom21 = zext i32 0 to i64 - %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr undef, i64 0, i64 %idxprom21, i64 %indvars.iv17 - store i32 undef, ptr %arrayidx23, align 4 + %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr %p, i64 0, i64 %idxprom21, i64 %indvars.iv17 + store i32 poison, ptr %arrayidx23, align 4 %indvars.next= add i64 %indvars.iv17, -1 %0 = trunc i64 %indvars.next to i32 - %cmp15 = icmp ugt i32 %0, undef + %cmp15 = icmp ugt i32 %0, poison br i1 %cmp15, label %for.body, label %loopexit loopexit: diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll index 67970c41f765e..0b6c4f32772f5 100644 --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -385,7 +385,7 @@ define internal void @.omp_outlined..4(ptr noalias %.global_tid., ptr noalias %. 
; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 ; CHECK-SAME: (ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -458,7 +458,7 @@ define internal void @.omp_outlined..5(ptr noalias %.global_tid., ptr noalias %. ; CHECK-SAME: (ptr noalias nofree readonly captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr noundef nonnull @[[GLOB0]]) #[[ATTR19]] -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -534,7 +534,7 @@ define internal void @.omp_outlined..6(ptr noalias %.global_tid., ptr noalias %. ; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr noundef nonnull align 4 [[A1]]) #[[ATTR20:[0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[A1]], align 4 ; CHECK-NEXT: store ptr [[A1]], ptr [[DOTOMP_REDUCTION_RED_LIST]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(ptr noundef nonnull @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 noundef 1, i64 noundef 8, ptr noundef nonnull align 8 [[DOTOMP_REDUCTION_RED_LIST]], ptr noundef nonnull @.omp.reduction.reduction_func, ptr noundef nonnull @.gomp_critical_user_.reduction.var) ; CHECK-NEXT: switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ ; CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] @@ -646,10 +646,10 @@ define internal void @.omp.reduction.reduction_func(ptr %arg, ptr %arg1) { ; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func ; CHECK-SAME: (ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG:%.*]], ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG1:%.*]]) #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4, 
!invariant.load [[META1]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll index 55adda7d5b0f3..08191c636bd3f 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll @@ -18,45 +18,45 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT19]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: 
[[TMP2:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD18]] to <4 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD19]] to <4 x double> ; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT20]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT15]] ; CHECK-NEXT: [[TMP8:%.*]] = fcmp fast ogt <4 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt <4 x double> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP6]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP13:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP7]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP12]] -; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP13]] +; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP12]] +; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI18]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP17:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP16]] -; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI15]], [[TMP17]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX21:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX21]]) +; CHECK-NEXT: [[BIN_RDX20:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX20]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER22]] ; CHECK: [[FOR_BODY_PREHEADER22]]: @@ -65,11 +65,11 @@ define nofpclass(nan inf) 
double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_010_PH:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V1_012:%.*]] = phi double [ [[V1_2:%.*]], %[[FOR_BODY]] ], [ [[V1_011_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V0_011:%.*]] = phi double [ [[V0_2:%.*]], %[[FOR_BODY]] ], [ [[V0_010_PH]], %[[FOR_BODY_PREHEADER22]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]] ; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]] @@ -79,16 +79,16 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_2]] = fadd reassoc arcp contract afn double [[V0_011]], [[ADD8]] ; CHECK-NEXT: [[ADD4:%.*]] = select ninf i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2]] = fadd reassoc arcp contract afn double [[V1_012]], [[ADD4]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[V0_1:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_1:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]] +; CHECK-NEXT: [[V0_1_LCSSA:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[V1_1_LCSSA:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = fadd fast double [[V1_1_LCSSA]], [[V0_1_LCSSA]] ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP24]], %[[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret double [[ADD5]] ; entry: @@ -193,29 +193,29 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT35]], <4 x double> poison, <4 x 
i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT29]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br label %[[FOR_BODY_US:.*]] ; CHECK: [[FOR_BODY_US]]: -; CHECK-NEXT: [[V1_021_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] -; CHECK-NEXT: [[V0_020_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V1_019_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V0_018_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY3_US_PREHEADER:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_021_US]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_020_US]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_019_US]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_018_US]], i64 0 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <4 x double> [ [[TMP27]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x 
float> [[WIDE_LOAD]] to <4 x double> ; CHECK-NEXT: [[TMP5:%.*]] = fpext <4 x float> [[WIDE_LOAD34]] to <4 x double> @@ -223,8 +223,8 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP7:%.*]] = tail call fast <4 x double> @llvm.exp2.v4f64(<4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[TMP7]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT36]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT36]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT30]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT30]] ; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x double> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[TMP10]], [[TMP10]] @@ -237,26 +237,26 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP21:%.*]] = select ninf <4 x i1> [[TMP13]], <4 x double> [[TMP15]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP22]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP20]] ; CHECK-NEXT: [[TMP23]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI31]], [[TMP21]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP23]], [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX37:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX37]]) +; CHECK-NEXT: [[BIN_RDX35:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX35]]) ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US_PREHEADER]] ; CHECK: [[FOR_BODY3_US_PREHEADER]]: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_021_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_020_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY3_US:.*]] ; CHECK: [[FOR_BODY3_US]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi 
i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V1_116_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V1_114_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V0_115_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V0_113_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US1]], align 4 ; CHECK-NEXT: [[CONV_US:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast double @llvm.exp2.f64(double [[CONV_US]]) ; CHECK-NEXT: [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]] @@ -267,7 +267,7 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V0_2_US]] = fadd reassoc arcp contract afn double [[V0_115_US]], [[ADD12_US]] ; CHECK-NEXT: [[ADD7_US1:%.*]] = select ninf i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2_US]] = fadd reassoc arcp contract afn double [[V1_116_US]], [[ADD7_US1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]: @@ -275,17 +275,18 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V1_2_US_LCSSA]] = phi double [ [[TMP25]], %[[MIDDLE_BLOCK]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ] ; CHECK-NEXT: [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1 ; CHECK-NEXT: [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]] -; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]] +; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10_LOOPEXIT:.*]], label %[[FOR_BODY_US]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[BLOCK_017:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[BLOCK_017]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]] +; CHECK: [[FOR_END10_LOOPEXIT]]: +; CHECK-NEXT: [[TMP29:%.*]] = fadd fast double [[V1_2_US_LCSSA]], [[V0_2_US_LCSSA]] +; CHECK-NEXT: br label %[[FOR_END10]] ; CHECK: [[FOR_END10]]: -; CHECK-NEXT: [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]] +; CHECK-NEXT: [[ADD11:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29]], %[[FOR_END10_LOOPEXIT]] ], [ 0.000000e+00, %[[FOR_BODY]] ] ; CHECK-NEXT: ret double [[ADD11]] ; entry: diff --git 
a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll index 09f583f9242d5..3416584729317 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -38,5 +38,5 @@ define <4 x float> @fixed_vec_exp(<4 x float> %input) { declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0 -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll index c1a87f0c5f907..577efcbbac012 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll @@ -930,7 +930,7 @@ entry: ; COST-LABEL: Function: mla_v8i8_i32 -; COST: Cost: '-18' +; COST: Cost: '-24' define i32 @mla_v8i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v8i8_i32( ; CHECK-NEXT: entry: @@ -1009,7 +1009,7 @@ entry: ; COST-LABEL: Function: mla_v16i8_i32 -; COST: Cost: '-40' +; COST: Cost: '-52' define i32 @mla_v16i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v16i8_i32( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll new file mode 100644 index 0000000000000..590b0be973002 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S --mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +@a = common global [100 x i64] zeroinitializer, align 64 + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 1) +; CHECK-NEXT: br i1 false, label %[[LOP_RHSCNT_I_PEEL:.*]], label %[[LAND_END_I_PEEL:.*]] +; CHECK: [[LOP_RHSCNT_I_PEEL]]: +; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], <i64 1, i64 0> +; CHECK-NEXT: br label %[[LAND_END_I_PEEL]] +; CHECK: [[LAND_END_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP4]], %[[LOP_RHSCNT_I_PEEL]] ] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: ret void +; +entry: + %.promoted104.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + %.promoted103.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + %0 = add i64 %.promoted104.i, 1 + %1 = add i64 %.promoted103.i, 1 + %2 = add i64 %0, 1 + br i1 false, label %lop.rhscnt.i.peel, label %land.end.i.peel + +lop.rhscnt.i.peel: + %3 = or i64 %1, 1 + br 
label %land.end.i.peel + +land.end.i.peel: + %4 = phi i64 [ %2, %entry ], [ %0, %lop.rhscnt.i.peel ] + %5 = phi i64 [ %1, %entry ], [ %3, %lop.rhscnt.i.peel ] + store i64 %5, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + store i64 %4, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + ret void +} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll index 533b1f691f5ad..42b32e769d8d7 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll @@ -1,26 +1,35 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s declare void @llvm.experimental.guard(i1, ...) -define void @test_simple_case(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_simple_case( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: guarded.us: +define void @test_simple_case(i1 %cond, i32 %N) !prof !0 { +; CHECK-LABEL: define void @test_simple_case( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: deopt: +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -38,29 +47,44 @@ exit: } define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_two_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] -; CHECK: entry.split.us.split.us: -; CHECK-NEXT: br label [[LOOP_US_US:%.*]] -; CHECK: loop.us.us: -; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US_US:%.*]] -; CHECK: guarded.us.us: -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: guarded.us2: +; CHECK-LABEL: define void @test_two_guards( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US2:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US:.*]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: br label %[[GUARDED_US2]] +; CHECK: [[GUARDED_US2]]: ; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]] -; CHECK: deopt1: +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[EXIT_SPLIT_US_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: deopt: +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: exit: +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -80,35 +104,46 @@ exit: } define void @test_conditional_guards(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_conditional_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FROZEN:%.+]] = freeze i1 [[COND:%.*]] -; CHECK-NEXT: br i1 [[FROZEN]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ] +; CHECK-LABEL: define void @test_conditional_guards( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[BACKEDGE_US:.*]] ] ; CHECK-NEXT: [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123 -; CHECK-NEXT: br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]] -; CHECK: guard.us: -; CHECK-NEXT: br label [[GUARDED_US:%.*]] -; CHECK: backedge.us: +; CHECK-NEXT: br i1 [[CONDITION_US]], label %[[GUARD_US:.*]], label %[[BACKEDGE_US]] +; CHECK: [[GUARD_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[BACKEDGE_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[BACKEDGE_US]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ] ; CHECK-NEXT: [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123 -; CHECK-NEXT: br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]] -; CHECK: guard: -; CHECK-NEXT: br label [[DEOPT:%.*]] -; CHECK: deopt: +; CHECK-NEXT: br i1 [[CONDITION]], label %[[GUARD:.*]], label %[[BACKEDGE]] +; CHECK: [[GUARD]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: backedge: +; CHECK: [[BACKEDGE]]: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_SPLIT:.*]] +; CHECK: [[EXIT_SPLIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -133,53 +168,54 @@ exit: } define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-LABEL: define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %cond, label %entry.split, label %outer_loop.split -; CHECK: entry.split: -; CHECK-NEXT: br i1 %arg, label %entry.split.split.us, label %entry.split.split -; CHECK: entry.split.split.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.us: -; CHECK-NEXT: br label %outer_loop.split.us.us -; CHECK: outer_backedge.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.split.us.us: -; CHECK-NEXT: br label %loop.us.us -; CHECK: loop.us.us: -; CHECK-NEXT: %iv.us.us = phi i32 [ 0, %outer_loop.split.us.us ], [ %iv.next.us.us, %guarded.us.us ] -; CHECK-NEXT: br label %guarded.us.us -; CHECK: guarded.us.us: -; CHECK-NEXT: %iv.next.us.us = add i32 %iv.us.us, 1 -; CHECK-NEXT: %loop.cond.us.us = icmp slt i32 %iv.next.us.us, %N -; CHECK-NEXT: br i1 %loop.cond.us.us, label %loop.us.us, label %outer_backedge.split.us.us -; CHECK: outer_backedge.split.us.us: -; CHECK-NEXT: br label %outer_backedge.us -; CHECK: entry.split.split: -; CHECK-NEXT: br label %outer_loop -; CHECK: outer_loop: -; CHECK-NEXT: br label %outer_loop.split.us -; CHECK: outer_loop.split.us: -; CHECK-NEXT: br label %loop.us -; CHECK: loop.us: -; CHECK-NEXT: %iv.us = phi i32 [ 0, %outer_loop.split.us ], [ %iv.next.us, %guarded.us ] -; CHECK-NEXT: br label %guarded.us -; CHECK: guarded.us: -; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1 -; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N -; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %outer_backedge.split.us -; CHECK: outer_backedge.split.us: -; CHECK-NEXT: br label %outer_backedge -; CHECK: outer_loop.split: -; CHECK-NEXT: br label %loop -; CHECK: loop: -; CHECK-NEXT: br label %deopt -; CHECK: deopt: +; CHECK-LABEL: define void @test_nested_loop( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]], i1 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br i1 [[ARG]], label %[[ENTRY_SPLIT_SPLIT_US:.*]], label %[[ENTRY_SPLIT_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US:.*]] +; CHECK: [[OUTER_LOOP_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US_US:.*]] +; CHECK: [[OUTER_BACKEDGE_US:.*]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US]] +; CHECK: [[OUTER_LOOP_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[OUTER_BACKEDGE_SPLIT_US_US:.*]] +; CHECK: 
[[OUTER_BACKEDGE_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE_US]] +; CHECK: [[ENTRY_SPLIT_SPLIT]]: +; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] +; CHECK: [[OUTER_LOOP]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US:.*]] +; CHECK: [[OUTER_LOOP_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[OUTER_BACKEDGE_SPLIT_US:.*]] +; CHECK: [[OUTER_BACKEDGE_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE:.*]] +; CHECK: [[OUTER_LOOP_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: outer_backedge: -; CHECK-NEXT: br label %exit -; CHECK: exit: +; CHECK: [[OUTER_BACKEDGE]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -204,17 +240,50 @@ exit: } define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_sibling_loops( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-LABEL: define void @test_sibling_loops( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP1_US:.*]] +; CHECK: [[LOOP1_US]]: +; CHECK-NEXT: [[IV1_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV1_NEXT_US]] = add i32 [[IV1_US]], 1 +; CHECK-NEXT: [[LOOP1_COND_US:%.*]] = icmp slt i32 [[IV1_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP1_COND_US]], label %[[LOOP1_US]], label %[[BETWEEN_SPLIT_US:.*]] +; CHECK: [[BETWEEN_SPLIT_US]]: +; CHECK-NEXT: br label %[[BETWEEN:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK: [[BETWEEN]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[BETWEEN_SPLIT_US2]]: +; CHECK-NEXT: br label %[[LOOP2_US:.*]] +; CHECK: [[LOOP2_US]]: +; CHECK-NEXT: [[IV2_US:%.*]] = phi i32 [ 0, %[[BETWEEN_SPLIT_US2]] ], [ [[IV2_NEXT_US:%.*]], %[[GUARDED_US3:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US3]] +; CHECK: [[GUARDED_US3]]: +; CHECK-NEXT: [[IV2_NEXT_US]] = add i32 [[IV2_US]], 1 +; CHECK-NEXT: [[LOOP2_COND_US:%.*]] = icmp slt i32 [[IV2_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP2_COND_US]], label %[[LOOP2_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BETWEEN_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -242,11 +311,21 @@ exit: } ; Check that we don't do anything because of cleanuppad. -; CHECK-LABEL: @test_cleanuppad( -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] -; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard( define void @test_cleanuppad(i1 %cond, i32 %N) personality ptr @__CxxFrameHandler3 { - +; CHECK-LABEL: define void @test_cleanuppad( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) personality ptr @__CxxFrameHandler3 { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[COND]]) [ "deopt"() ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: invoke void @may_throw(i32 [[IV]]) +; CHECK-NEXT: to label %[[LOOP]] unwind label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[CP:%.*]] = cleanuppad within none [] +; CHECK-NEXT: cleanupret from [[CP]] unwind to caller +; entry: br label %loop @@ -264,3 +343,9 @@ exit: declare void @may_throw(i32 %i) declare i32 @__CxxFrameHandler3(...) + +!0 = !{!"function_entry_count", i32 10} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1048575, i32 1} +;. 
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll index 536e0c6a0e74a..3c84dea2a0672 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll @@ -2,40 +2,40 @@ ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop(simple-loop-unswitch<nontrivial>),simplifycfg" | FileCheck %s ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop-mssa(simple-loop-unswitch<nontrivial>),simplifycfg" -verify-memoryssa | FileCheck %s -define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { +define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1:![0-9]+]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]] -; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] +; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] -; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 -; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK: guarded.us: -; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] -; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] -; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 -; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]] +; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR_US]], align 4 +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] ; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] -; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 -; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[EL_PTR1:%.*]] = 
getelementptr i32, ptr [[P]], i32 [[IV1]] +; CHECK-NEXT: [[EL1:%.*]] = load i32, ptr [[EL_PTR1]], align 4 +; CHECK-NEXT: [[BOUND_CHECK1:%.*]] = icmp ult i32 [[EL1]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK1]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: -; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL1]], [[X]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: -; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] -; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL1]] +; CHECK-NEXT: store i32 [[IV1]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -76,7 +76,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_void_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -133,7 +133,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_constants( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300 ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -141,7 +141,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 200 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 300 ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -154,13 +154,13 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], 200 -; CHECK-NEXT: br i1 
[[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -200,17 +200,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_degenerate_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -257,17 +257,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_cold( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF8:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = 
getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -314,17 +314,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_overflowing_metadata(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_overflowing_metadata( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF7:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF9:![0-9]+]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF9]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -371,7 +371,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -379,7 +379,7 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -392,16 +392,16 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: 
[[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -441,7 +441,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02_inverse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -449,7 +449,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp uge i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -462,16 +462,16 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp uge i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]], !prof [[PROF11:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -511,7 +511,7 @@ range_check_failed: ; preds = %guarded define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_03( ; CHECK-NEXT: entry: 
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -519,7 +519,7 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -532,16 +532,16 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -581,7 +581,7 @@ range_check_failed: ; preds = %guarded define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_04( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 128, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -589,7 +589,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i8, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i8 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; 
CHECK-NEXT: [[EL_WIDE_US:%.*]] = zext i8 [[EL_US]] to i32 ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_WIDE_US]], [[X]] @@ -603,17 +603,17 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32 ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL_WIDE]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -651,17 +651,19 @@ range_check_failed: ; preds = %guarded ret i32 -2 } ;. -; CHECK: [[META0:![0-9]+]] = !{} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 100, i32 1} -; CHECK: [[LOOP2]] = distinct !{!2, !3} -; CHECK: [[META3:![0-9]+]] = !{!"llvm.loop.unswitch.injection.disable"} -; CHECK: [[LOOP4]] = distinct !{!4, !3} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 2, i32 3} -; CHECK: [[PROF7]] = !{!"branch_weights", i32 -1, i32 -1000} -; CHECK: [[LOOP8]] = distinct !{!8, !3} -; CHECK: [[LOOP9]] = distinct !{!9, !3} -; CHECK: [[PROF10]] = !{!"branch_weights", i32 1, i32 100} -; CHECK: [[LOOP11]] = distinct !{!11, !3} -; CHECK: [[LOOP12]] = distinct !{!12, !3} +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[META1]] = !{} +; CHECK: [[PROF2]] = !{!"unknown", !"simple-loop-unswitch"} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 100, i32 1} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{!"llvm.loop.unswitch.injection.disable"} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META5]]} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 2, i32 3} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 -1, i32 -1000} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]]} +; CHECK: [[PROF11]] = !{!"branch_weights", i32 1, i32 100} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]]} ;. 
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll index 1d8942079ffd8..87161707d9f69 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll @@ -1,14 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s declare void @clobber() -define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { +define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) !prof !0 { ; CHECK-LABEL: @partial_unswitch_true_successor( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: entry.split.us: ; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]] ; CHECK: loop.header.us: @@ -19,7 +19,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -28,7 +28,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[SC:%.*]] = icmp eq i32 [[LV]], 100 -; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]] +; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]], !prof [[PROF1]] ; CHECK: noclobber: ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: clobber: @@ -37,7 +37,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !prof [[PROF2]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -50,7 +50,7 @@ loop.header: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] %lv = load i32, ptr %ptr %sc = icmp eq i32 %lv, 100 - br i1 %sc, label %noclobber, label %clobber + br i1 %sc, label %noclobber, label %clobber, !prof !1 noclobber: br label %loop.latch @@ -62,7 +62,7 @@ clobber: loop.latch: %c = icmp ult i32 %iv, %N %iv.next = add i32 %iv, 1 - br i1 %c, label %loop.header, label %exit + br i1 %c, label %loop.header, label %exit, !prof !2 exit: ret i32 10 @@ -102,7 +102,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop 
[[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -171,7 +171,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -246,7 +246,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[RED_NEXT]] = phi i32 [ [[ADD_5]], [[CLOBBER]] ], [ [[ADD_10]], [[NOCLOBBER]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -325,7 +325,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -637,7 +637,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit.loopexit.split: ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: exit.loopexit: @@ -713,7 +713,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -784,7 +784,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1073,7 +1073,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; 
CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1138,7 +1138,7 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32 ; CHECK-NEXT: store i32 [[TMP1:%.*]], ptr [[PTR]], align 16 ; CHECK-NEXT: br label [[EXITING]] ; CHECK: exiting: -; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RET_VAL:%.*]] = phi i32 [ 1, [[EXITING]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -1249,7 +1249,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 % ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1360,7 +1360,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1425,7 +1425,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1456,15 +1456,26 @@ exit: ret i32 10 } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]} -; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[UNSWITCH_PARTIAL_DISABLE]]} +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1000, i32 1} +!2 = !{!"branch_weights", i32 100, i32 3} + +;. 
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1000, i32 1} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 100, i32 3} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; CHECK: [[META4]] = !{!"llvm.loop.unswitch.partial.disable"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/pr166369.ll b/llvm/test/Transforms/SimplifyCFG/pr166369.ll new file mode 100644 index 0000000000000..c0a85c0293dd8 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr166369.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +; Make sure we handle full-set ranges correctly. +define void @test_i1() { +; CHECK-LABEL: define void @test_i1() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp ugt i1 false, true + br label %bb5 + +bb5: + %select = select i1 %icmp, i1 %icmp, i1 false + br i1 %select, label %bb5, label %bb6 + +bb6: + ret void +} + +define void @test_i3() { +; CHECK-LABEL: define void @test_i3() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp ugt i3 0, 7 + br label %bb5 + +bb5: + %select = select i1 %icmp, i1 %icmp, i1 false + br i1 %select, label %bb5, label %bb6 + +bb6: + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll index d1fba91d1e505..169803f7aa012 100644 --- a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll +++ b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll @@ -321,7 +321,7 @@ three: !1 = !{!"branch_weights", i32 5, i32 7, i32 11, i32 13, i32 17} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { optsize } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. 
; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 48, i32 5} diff --git a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll similarity index 54% rename from llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll rename to llvm/test/Transforms/SimplifyCFG/switch-on-const.ll index e8b58639c13dd..1ab1b5e8bd838 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll @@ -154,6 +154,132 @@ bees: unreachable } +define void @pr165179(i1 %cond) { +; CHECK-LABEL: @pr165179( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB:%.*]] +; CHECK: if.else: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br label %switchbb + +if.else: + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond1 = phi i32 [ 1, %if.else ], [ -1, %if.then ] + switch i32 %cond1, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_phi(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[SWITCHBB:%.*]], label [[IF_ELSE]] +; CHECK: if.else: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ -1, [[IF_THEN]] ] +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: switchbb: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[PHI]], [[IF_ELSE]] ], [ 5, [[IF_THEN]] ] +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i32 [[COND]], -1 +; CHECK-NEXT: br i1 [[COND3]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + br i1 %cond1, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br i1 %cond2, label %switchbb, label %if.else + +if.else: + %phi = phi i32 [ 3, %entry ], [ -1, %if.then ] + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond = phi i32 [ %phi, %if.else ], [ 5, %if.then ] + switch i32 %cond, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_select(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = select i1 [[COND1:%.*]], i32 -1, i32 3 +; CHECK-NEXT: [[Y:%.*]] = select i1 [[COND2:%.*]], i32 [[X]], i32 5 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[Y]], -1 +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: 
common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %x = select i1 %cond1, i32 -1, i32 3 + %y = select i1 %cond2, i32 %x, i32 5 + switch i32 %y, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + declare void @llvm.trap() nounwind noreturn declare void @bees.a() nounwind declare void @bees.b() nounwind diff --git a/llvm/test/Transforms/SimplifyCFG/switch-umin.ll b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll new file mode 100644 index 0000000000000..44665365dc222 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +define void @switch_replace_default(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ], !prof !0 + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_and_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_and_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 4, label %case4 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case4: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_when_holes(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_when_holes( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 1, 
label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_switch_replace_default(i32 %x, i32 %y) { +; CHECK-LABEL: define void @do_not_switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: switch i32 [[MIN]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 %y) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_replace_switch_default_but_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @do_not_replace_switch_default_but_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[MIN]], label %[[CASE0:.*]] [ +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %case0 [ ; default is reachable, therefore simplification not triggered + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + i32 4, label %case4 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +case4: + call void @d() + ret void + +} + + +!0 = !{!"branch_weights", i32 1, i32 2, i32 3, i32 99, i32 5} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 5, i32 2, i32 3, i32 99} +;. 
diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll index f8bcbc057a7ae..428c18fc18e3d 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -221,6 +221,7 @@ define i1 @pr88607() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COND:%.*]] = select i1 false, i32 4, i32 1 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 false, i32 2, i32 [[COND]] +; CHECK-NEXT: [[COND1:%.*]] = icmp eq i32 [[SPEC_SELECT]], 1 ; CHECK-NEXT: ret i1 false ; entry: diff --git a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll index 88a729b7d941a..4de5ea948ed27 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll @@ -5,12 +5,11 @@ define void @f6() #0 { ; CHECK-LABEL: @f6( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: +; CHECK-NEXT: br label [[F1_EXIT_I:%.*]] +; CHECK: f1.exit.i: ; CHECK-NEXT: [[TOBOOL7_I:%.*]] = icmp ne i16 1, 0 -; CHECK-NEXT: br label [[FOR_COND_I]] +; CHECK-NEXT: br label [[F1_EXIT_I]] ; - entry: br label %for.cond.i diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..42f95194980d4 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. +; +; Note: currently, callbr blocks and their corresponding target blocks +; themselves are not handled by the structurizer.* If the CFG turns out to be +; unstructured at the end, the CFG lowering (si-annotate-control-flow) will +; detect this. For the currently intended use cases of callbr in the context of +; the AMDGPU backend, this is not a limitation (cf. +; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087). +; +; Note 2: while callbr and its targets remain untouched, everything else is +; handled as usual, even if it is nested in a callbr region. +; +; *FIXME: this will be fixed in the future. Callbr can be handled as follows: +; Input IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...] +; fallthrough: +; br label %exit +; indirect: +; br label %exit +; ... +; exit: +; ret void +; } +; ``` +; +; Output IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() +; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...] +; fake.indirect: ; preds = %0 +; br label %Flow +; fake.indirect1: ; preds = %0 +; br label %Flow +; fake.indirect2: ; preds = %0 +; br label %Flow +; ... +; Flow: ; preds = %fallthrough, %fake.indirect[0-N] +; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ] +; br i1 %1, label %indirect, label %Flow1 +; Flow1: ; preds = %Flow, %indirect +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect1 ], [ false, %indirect ] +; br i1 %2, label %indirect1, label %Flow2 +; Flow2: ; preds = %Flow, %indirect1 +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect2 ], [ false, %indirect1 ] +; br i1 %2, label %indirect2, label %Flow3 +; ... 
+; fallthrough: ; preds = %0 +; br label %Flow +; indirect: ; preds = %Flow +; br label %Flow1 +; indirect1: ; preds = %Flow1 +; br label %Flow2 +; indirect2: : preds = %Flow2 +; br label %Flow3 +; ... +; exit: ; preds = %indirectN, %FlowN +; ret void +; } +; ``` +; +; Output IR as ASCII-art: +; %0 +; --------------------- +; | | | | +; v v v v +; f f.i f.i1 f.i2 +; | | | | +; v v v v +; --------------------- +; %Flow +; | \ +; | %indirect +; | / +; %Flow1 +; | \ +; | %indirect1 +; | / +; %Flow2 +; | \ +; | %indirect2 +; | / +; %exit +; + +; Only callbr, nothing to do. +define void @callbr_simple() { +; CHECK-LABEL: define void @callbr_simple() { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr nested in non-callbr: non-callbr is transformed +define void @callbr_in_non_callbr(i1 %c) { +; CHECK-LABEL: define void @callbr_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +nocallbr: + br label %exit +exit: + ret void +} + +; Callbr parent of non-callbr: non-callbr is transformed +define void @non_callbr_in_callbr(i1 %c) { +; CHECK-LABEL: define void @non_callbr_in_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[FALLTHROUGH2]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %exit +fallthrough2: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr surrounded by non-callbr: all three regular branches are handled +; correctly +define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { +; CHECK-LABEL: define void 
@callbr_nested_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[D]], label %[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %d, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %f, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll index ee3a0539bf300..c005316f07f06 100644 --- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll @@ -11,5 +11,9 @@ define float @sinf(float %x) { } ; CHECK: declare void @acosf(...) + +; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]] +; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]] + ; CHECK: declare void @__umodti3(...) 
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll new file mode 100644 index 0000000000000..ffbf11d4106dc --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll @@ -0,0 +1,11 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define noundef nofpclass(nan) float @sqrtf(float %x) "foo" { + %ret = call float asm "; $0 = sqrt($1)", "=r,r"(float %x) + ret float %ret +} + +; FIXME: Individual fields of nofpclass not merged +; CHECK: define noundef nofpclass(ninf nsub nnorm) float @sqrtf(float %x) [[SQRT_ATTR:#[0-9]+]] { + +; CHECK: attributes [[SQRT_ATTR]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) "foo" } diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll new file mode 100644 index 0000000000000..f0f09e97d9dba --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll @@ -0,0 +1,23 @@ +; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %} + +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s %} + +; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] +; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] + +; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] +; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] + +; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]] +; SRET: declare void @__sincos_stret(ptr sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]] + +; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) } +; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) } + +; NONE-NOT: __sincos_stret +; NONE-NOT: __sincosf_stret diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll new file mode 100644 index 0000000000000..2451010df5b75 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck %s + +; Make sure there is no crash if there are definitions or 
declarations +; with the wrong type signature. + +; CHECK: define void @sqrtf() { +define void @sqrtf() { + ret void +} + +; CHECK: define float @sqrt(float %0) { +define float @sqrt(float) { + ret float 0.0 +} + +; CHECK: declare double @__sincos_stret(double) +declare double @__sincos_stret(double) + +; CHECK: declare { float, float } @__sincosf_stret(float) +declare { float, float } @__sincosf_stret(float) + diff --git a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll index 60700412686ea..e7b11cdf8475e 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll @@ -346,3 +346,189 @@ entry: call void @use.i32(i32 %ext.3) ret void } + +define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_all_lanes_used_no_freeze( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[SRC]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 255 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP3]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} + +define noundef i32 @zext_v4i8_not_all_lanes_used(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_not_all_lanes_used( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[TMP2]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.3 + ret i32 %add2 +} + +define i32 
@zext_v4i8_all_lanes_used_no_ub(<4 x i8> %src) { +; CHECK-LABEL: define i32 @zext_v4i8_all_lanes_used_no_ub( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} + +define noundef i32 @zext_v4i8_extracts_different_blocks(<4 x i8> %src, i1 %cond) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_extracts_different_blocks( +; CHECK-SAME: <4 x i8> [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP4]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[PHI]] +; CHECK-NEXT: ret i32 [[ADD2]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + br i1 %cond, label %then, label %else + +then: + %ext.2 = extractelement <4 x i32> %ext, i64 2 + br label %exit + +else: + %ext.3 = extractelement <4 x i32> %ext, i64 3 + br label %exit + +exit: + %phi = phi i32 [ %ext.2, %then ], [ %ext.3, %else ] + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %phi + ret i32 %add2 +} + + +declare void @may_throw() willreturn + +define noundef i32 
@zext_v4i8_throwing_call_between(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_throwing_call_between( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + call void @may_throw() + %ext.3 = extractelement <4 x i32> %ext, i64 3 + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll new file mode 100644 index 0000000000000..4b551fad5b43a --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll @@ -0,0 +1,567 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=vector-combine < %s | FileCheck -check-prefix=OPT %s + +; Generated from amdgpu-promote-alloca on array of vectors +; VectorCombiner should recognize chain of extract-insert vectors +; and turn them into one or two shuffles +define amdgpu_kernel void @extract_insert_chain_to_shuffles(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { +; OPT-LABEL: define amdgpu_kernel void @extract_insert_chain_to_shuffles( +; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <128 x i8> [[ALLOCA]], <128 x i8> [[TMP0]], <128 x i32> <i32 128, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x 
i8> [[TMP21]], i8 [[TMP22]], i32 11 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 +; OPT-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP33:%.*]] = shufflevector <128 x i8> [[TMP31]], <128 x i8> [[TMP32]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 128, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], 
i8 [[TMP34]], i32 17 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 18 +; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 19 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 20 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 21 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 22 +; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 23 +; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 24 +; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 25 +; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 26 +; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 27 +; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 28 +; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 29 +; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 30 +; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 31 +; OPT-NEXT: [[TMP64:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP65:%.*]] = shufflevector <128 x i8> [[TMP63]], <128 x i8> [[TMP64]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 128, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 33 +; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 34 +; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 35 +; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 36 +; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 37 +; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 38 +; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 39 +; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 40 +; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 41 +; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 42 +; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 43 +; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 44 +; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 45 +; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 
46 +; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 47 +; OPT-NEXT: [[TMP96:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP97:%.*]] = shufflevector <128 x i8> [[TMP95]], <128 x i8> [[TMP96]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 128, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 49 +; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 50 +; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 51 +; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 52 +; 
OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 53 +; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 54 +; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 55 +; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 56 +; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 57 +; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 58 +; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 59 +; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 60 +; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 61 +; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 62 +; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 63 +; OPT-NEXT: [[TMP128:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP129:%.*]] = shufflevector <128 x i8> [[TMP127]], <128 x i8> [[TMP128]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, 
i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 128, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 65 +; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 66 +; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 67 +; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 68 +; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 69 +; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 70 +; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 71 +; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 72 +; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 73 +; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 74 +; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 75 +; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 76 +; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 77 +; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 78 +; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 79 +; OPT-NEXT: [[TMP160:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP161:%.*]] = shufflevector <128 x i8> [[TMP159]], <128 x i8> [[TMP160]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 128, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 81 +; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 82 +; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 83 +; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 84 +; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 85 +; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 86 +; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> 
[[IN]], i64 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 87 +; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 88 +; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 89 +; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 90 +; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 91 +; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 92 +; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 93 +; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 94 +; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 95 +; OPT-NEXT: [[TMP192:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP193:%.*]] = shufflevector <128 x i8> [[TMP191]], <128 x i8> [[TMP192]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, 
i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 128, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 97 +; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 98 +; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 99 +; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 100 +; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 101 +; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 102 +; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 103 +; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 104 +; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 105 +; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 106 +; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 107 +; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 108 +; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 109 +; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 110 +; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 111 +; OPT-NEXT: [[TMP224:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP225:%.*]] = shufflevector <128 x i8> [[TMP223]], <128 x i8> [[TMP224]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 128, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 113 +; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 114 +; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 115 +; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 116 +; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 117 +; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 118 +; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 119 +; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 120 +; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: 
[[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 121 +; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 122 +; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 123 +; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 124 +; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 125 +; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 126 +; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 127 +; OPT-NEXT: [[TMP256:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> [[TMP256]], i8 [[TMP162]], i64 1 +; OPT-NEXT: [[TMP258:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP164]], i64 2 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP258]], i8 [[TMP166]], i64 3 +; OPT-NEXT: [[TMP260:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP168]], i64 4 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP260]], i8 [[TMP170]], i64 5 +; OPT-NEXT: [[TMP262:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP172]], i64 6 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP262]], i8 [[TMP174]], i64 7 +; OPT-NEXT: [[TMP264:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP176]], i64 8 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP264]], i8 [[TMP178]], i64 9 +; OPT-NEXT: [[TMP266:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP180]], i64 10 +; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP266]], i8 [[TMP182]], i64 11 +; OPT-NEXT: [[TMP268:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP184]], i64 12 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP268]], i8 [[TMP186]], i64 13 +; OPT-NEXT: [[TMP270:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP188]], i64 14 +; OPT-NEXT: [[TMP271:%.*]] = shufflevector <16 x i8> [[TMP270]], <16 x i8> [[IN]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> +; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP271]], [[ADD]] +; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 +; OPT-NEXT: ret void +; +entry: + %alloca = freeze <128 x i8> poison + %0 = extractelement <16 x i8> %in, i64 0 + %1 = insertelement <128 x i8> %alloca, i8 %0, i32 0 + %2 = extractelement <16 x i8> %in, i64 1 + %3 = insertelement <128 x i8> %1, i8 %2, i32 1 + %4 = extractelement <16 x i8> %in, i64 2 + %5 = insertelement <128 x i8> %3, i8 %4, i32 2 + %6 = extractelement <16 x i8> %in, i64 3 + %7 = insertelement <128 x i8> %5, i8 %6, i32 3 + %8 = extractelement <16 x i8> %in, i64 4 + %9 = insertelement <128 x i8> %7, i8 %8, i32 4 + %10 = extractelement <16 x i8> %in, i64 5 + %11 = insertelement <128 x i8> %9, i8 %10, i32 5 + %12 = extractelement <16 x i8> %in, 
i64 6 + %13 = insertelement <128 x i8> %11, i8 %12, i32 6 + %14 = extractelement <16 x i8> %in, i64 7 + %15 = insertelement <128 x i8> %13, i8 %14, i32 7 + %16 = extractelement <16 x i8> %in, i64 8 + %17 = insertelement <128 x i8> %15, i8 %16, i32 8 + %18 = extractelement <16 x i8> %in, i64 9 + %19 = insertelement <128 x i8> %17, i8 %18, i32 9 + %20 = extractelement <16 x i8> %in, i64 10 + %21 = insertelement <128 x i8> %19, i8 %20, i32 10 + %22 = extractelement <16 x i8> %in, i64 11 + %23 = insertelement <128 x i8> %21, i8 %22, i32 11 + %24 = extractelement <16 x i8> %in, i64 12 + %25 = insertelement <128 x i8> %23, i8 %24, i32 12 + %26 = extractelement <16 x i8> %in, i64 13 + %27 = insertelement <128 x i8> %25, i8 %26, i32 13 + %28 = extractelement <16 x i8> %in, i64 14 + %29 = insertelement <128 x i8> %27, i8 %28, i32 14 + %30 = extractelement <16 x i8> %in, i64 15 + %31 = insertelement <128 x i8> %29, i8 %30, i32 15 + %32 = extractelement <16 x i8> %in, i64 0 + %33 = insertelement <128 x i8> %31, i8 %32, i32 16 + %34 = extractelement <16 x i8> %in, i64 1 + %35 = insertelement <128 x i8> %33, i8 %34, i32 17 + %36 = extractelement <16 x i8> %in, i64 2 + %37 = insertelement <128 x i8> %35, i8 %36, i32 18 + %38 = extractelement <16 x i8> %in, i64 3 + %39 = insertelement <128 x i8> %37, i8 %38, i32 19 + %40 = extractelement <16 x i8> %in, i64 4 + %41 = insertelement <128 x i8> %39, i8 %40, i32 20 + %42 = extractelement <16 x i8> %in, i64 5 + %43 = insertelement <128 x i8> %41, i8 %42, i32 21 + %44 = extractelement <16 x i8> %in, i64 6 + %45 = insertelement <128 x i8> %43, i8 %44, i32 22 + %46 = extractelement <16 x i8> %in, i64 7 + %47 = insertelement <128 x i8> %45, i8 %46, i32 23 + %48 = extractelement <16 x i8> %in, i64 8 + %49 = insertelement <128 x i8> %47, i8 %48, i32 24 + %50 = extractelement <16 x i8> %in, i64 9 + %51 = insertelement <128 x i8> %49, i8 %50, i32 25 + %52 = extractelement <16 x i8> %in, i64 10 + %53 = insertelement <128 x i8> %51, i8 %52, i32 26 + %54 = extractelement <16 x i8> %in, i64 11 + %55 = insertelement <128 x i8> %53, i8 %54, i32 27 + %56 = extractelement <16 x i8> %in, i64 12 + %57 = insertelement <128 x i8> %55, i8 %56, i32 28 + %58 = extractelement <16 x i8> %in, i64 13 + %59 = insertelement <128 x i8> %57, i8 %58, i32 29 + %60 = extractelement <16 x i8> %in, i64 14 + %61 = insertelement <128 x i8> %59, i8 %60, i32 30 + %62 = extractelement <16 x i8> %in, i64 15 + %63 = insertelement <128 x i8> %61, i8 %62, i32 31 + %64 = extractelement <16 x i8> %in, i64 0 + %65 = insertelement <128 x i8> %63, i8 %64, i32 32 + %66 = extractelement <16 x i8> %in, i64 1 + %67 = insertelement <128 x i8> %65, i8 %66, i32 33 + %68 = extractelement <16 x i8> %in, i64 2 + %69 = insertelement <128 x i8> %67, i8 %68, i32 34 + %70 = extractelement <16 x i8> %in, i64 3 + %71 = insertelement <128 x i8> %69, i8 %70, i32 35 + %72 = extractelement <16 x i8> %in, i64 4 + %73 = insertelement <128 x i8> %71, i8 %72, i32 36 + %74 = extractelement <16 x i8> %in, i64 5 + %75 = insertelement <128 x i8> %73, i8 %74, i32 37 + %76 = extractelement <16 x i8> %in, i64 6 + %77 = insertelement <128 x i8> %75, i8 %76, i32 38 + %78 = extractelement <16 x i8> %in, i64 7 + %79 = insertelement <128 x i8> %77, i8 %78, i32 39 + %80 = extractelement <16 x i8> %in, i64 8 + %81 = insertelement <128 x i8> %79, i8 %80, i32 40 + %82 = extractelement <16 x i8> %in, i64 9 + %83 = insertelement <128 x i8> %81, i8 %82, i32 41 + %84 = extractelement <16 x i8> %in, i64 10 + %85 = insertelement <128 x i8> %83, i8 %84, 
i32 42 + %86 = extractelement <16 x i8> %in, i64 11 + %87 = insertelement <128 x i8> %85, i8 %86, i32 43 + %88 = extractelement <16 x i8> %in, i64 12 + %89 = insertelement <128 x i8> %87, i8 %88, i32 44 + %90 = extractelement <16 x i8> %in, i64 13 + %91 = insertelement <128 x i8> %89, i8 %90, i32 45 + %92 = extractelement <16 x i8> %in, i64 14 + %93 = insertelement <128 x i8> %91, i8 %92, i32 46 + %94 = extractelement <16 x i8> %in, i64 15 + %95 = insertelement <128 x i8> %93, i8 %94, i32 47 + %96 = extractelement <16 x i8> %in, i64 0 + %97 = insertelement <128 x i8> %95, i8 %96, i32 48 + %98 = extractelement <16 x i8> %in, i64 1 + %99 = insertelement <128 x i8> %97, i8 %98, i32 49 + %100 = extractelement <16 x i8> %in, i64 2 + %101 = insertelement <128 x i8> %99, i8 %100, i32 50 + %102 = extractelement <16 x i8> %in, i64 3 + %103 = insertelement <128 x i8> %101, i8 %102, i32 51 + %104 = extractelement <16 x i8> %in, i64 4 + %105 = insertelement <128 x i8> %103, i8 %104, i32 52 + %106 = extractelement <16 x i8> %in, i64 5 + %107 = insertelement <128 x i8> %105, i8 %106, i32 53 + %108 = extractelement <16 x i8> %in, i64 6 + %109 = insertelement <128 x i8> %107, i8 %108, i32 54 + %110 = extractelement <16 x i8> %in, i64 7 + %111 = insertelement <128 x i8> %109, i8 %110, i32 55 + %112 = extractelement <16 x i8> %in, i64 8 + %113 = insertelement <128 x i8> %111, i8 %112, i32 56 + %114 = extractelement <16 x i8> %in, i64 9 + %115 = insertelement <128 x i8> %113, i8 %114, i32 57 + %116 = extractelement <16 x i8> %in, i64 10 + %117 = insertelement <128 x i8> %115, i8 %116, i32 58 + %118 = extractelement <16 x i8> %in, i64 11 + %119 = insertelement <128 x i8> %117, i8 %118, i32 59 + %120 = extractelement <16 x i8> %in, i64 12 + %121 = insertelement <128 x i8> %119, i8 %120, i32 60 + %122 = extractelement <16 x i8> %in, i64 13 + %123 = insertelement <128 x i8> %121, i8 %122, i32 61 + %124 = extractelement <16 x i8> %in, i64 14 + %125 = insertelement <128 x i8> %123, i8 %124, i32 62 + %126 = extractelement <16 x i8> %in, i64 15 + %127 = insertelement <128 x i8> %125, i8 %126, i32 63 + %128 = extractelement <16 x i8> %in, i64 0 + %129 = insertelement <128 x i8> %127, i8 %128, i32 64 + %130 = extractelement <16 x i8> %in, i64 1 + %131 = insertelement <128 x i8> %129, i8 %130, i32 65 + %132 = extractelement <16 x i8> %in, i64 2 + %133 = insertelement <128 x i8> %131, i8 %132, i32 66 + %134 = extractelement <16 x i8> %in, i64 3 + %135 = insertelement <128 x i8> %133, i8 %134, i32 67 + %136 = extractelement <16 x i8> %in, i64 4 + %137 = insertelement <128 x i8> %135, i8 %136, i32 68 + %138 = extractelement <16 x i8> %in, i64 5 + %139 = insertelement <128 x i8> %137, i8 %138, i32 69 + %140 = extractelement <16 x i8> %in, i64 6 + %141 = insertelement <128 x i8> %139, i8 %140, i32 70 + %142 = extractelement <16 x i8> %in, i64 7 + %143 = insertelement <128 x i8> %141, i8 %142, i32 71 + %144 = extractelement <16 x i8> %in, i64 8 + %145 = insertelement <128 x i8> %143, i8 %144, i32 72 + %146 = extractelement <16 x i8> %in, i64 9 + %147 = insertelement <128 x i8> %145, i8 %146, i32 73 + %148 = extractelement <16 x i8> %in, i64 10 + %149 = insertelement <128 x i8> %147, i8 %148, i32 74 + %150 = extractelement <16 x i8> %in, i64 11 + %151 = insertelement <128 x i8> %149, i8 %150, i32 75 + %152 = extractelement <16 x i8> %in, i64 12 + %153 = insertelement <128 x i8> %151, i8 %152, i32 76 + %154 = extractelement <16 x i8> %in, i64 13 + %155 = insertelement <128 x i8> %153, i8 %154, i32 77 + %156 = extractelement 
<16 x i8> %in, i64 14 + %157 = insertelement <128 x i8> %155, i8 %156, i32 78 + %158 = extractelement <16 x i8> %in, i64 15 + %159 = insertelement <128 x i8> %157, i8 %158, i32 79 + %160 = extractelement <16 x i8> %in, i64 0 + %161 = insertelement <128 x i8> %159, i8 %160, i32 80 + %162 = extractelement <16 x i8> %in, i64 1 + %163 = insertelement <128 x i8> %161, i8 %162, i32 81 + %164 = extractelement <16 x i8> %in, i64 2 + %165 = insertelement <128 x i8> %163, i8 %164, i32 82 + %166 = extractelement <16 x i8> %in, i64 3 + %167 = insertelement <128 x i8> %165, i8 %166, i32 83 + %168 = extractelement <16 x i8> %in, i64 4 + %169 = insertelement <128 x i8> %167, i8 %168, i32 84 + %170 = extractelement <16 x i8> %in, i64 5 + %171 = insertelement <128 x i8> %169, i8 %170, i32 85 + %172 = extractelement <16 x i8> %in, i64 6 + %173 = insertelement <128 x i8> %171, i8 %172, i32 86 + %174 = extractelement <16 x i8> %in, i64 7 + %175 = insertelement <128 x i8> %173, i8 %174, i32 87 + %176 = extractelement <16 x i8> %in, i64 8 + %177 = insertelement <128 x i8> %175, i8 %176, i32 88 + %178 = extractelement <16 x i8> %in, i64 9 + %179 = insertelement <128 x i8> %177, i8 %178, i32 89 + %180 = extractelement <16 x i8> %in, i64 10 + %181 = insertelement <128 x i8> %179, i8 %180, i32 90 + %182 = extractelement <16 x i8> %in, i64 11 + %183 = insertelement <128 x i8> %181, i8 %182, i32 91 + %184 = extractelement <16 x i8> %in, i64 12 + %185 = insertelement <128 x i8> %183, i8 %184, i32 92 + %186 = extractelement <16 x i8> %in, i64 13 + %187 = insertelement <128 x i8> %185, i8 %186, i32 93 + %188 = extractelement <16 x i8> %in, i64 14 + %189 = insertelement <128 x i8> %187, i8 %188, i32 94 + %190 = extractelement <16 x i8> %in, i64 15 + %191 = insertelement <128 x i8> %189, i8 %190, i32 95 + %192 = extractelement <16 x i8> %in, i64 0 + %193 = insertelement <128 x i8> %191, i8 %192, i32 96 + %194 = extractelement <16 x i8> %in, i64 1 + %195 = insertelement <128 x i8> %193, i8 %194, i32 97 + %196 = extractelement <16 x i8> %in, i64 2 + %197 = insertelement <128 x i8> %195, i8 %196, i32 98 + %198 = extractelement <16 x i8> %in, i64 3 + %199 = insertelement <128 x i8> %197, i8 %198, i32 99 + %200 = extractelement <16 x i8> %in, i64 4 + %201 = insertelement <128 x i8> %199, i8 %200, i32 100 + %202 = extractelement <16 x i8> %in, i64 5 + %203 = insertelement <128 x i8> %201, i8 %202, i32 101 + %204 = extractelement <16 x i8> %in, i64 6 + %205 = insertelement <128 x i8> %203, i8 %204, i32 102 + %206 = extractelement <16 x i8> %in, i64 7 + %207 = insertelement <128 x i8> %205, i8 %206, i32 103 + %208 = extractelement <16 x i8> %in, i64 8 + %209 = insertelement <128 x i8> %207, i8 %208, i32 104 + %210 = extractelement <16 x i8> %in, i64 9 + %211 = insertelement <128 x i8> %209, i8 %210, i32 105 + %212 = extractelement <16 x i8> %in, i64 10 + %213 = insertelement <128 x i8> %211, i8 %212, i32 106 + %214 = extractelement <16 x i8> %in, i64 11 + %215 = insertelement <128 x i8> %213, i8 %214, i32 107 + %216 = extractelement <16 x i8> %in, i64 12 + %217 = insertelement <128 x i8> %215, i8 %216, i32 108 + %218 = extractelement <16 x i8> %in, i64 13 + %219 = insertelement <128 x i8> %217, i8 %218, i32 109 + %220 = extractelement <16 x i8> %in, i64 14 + %221 = insertelement <128 x i8> %219, i8 %220, i32 110 + %222 = extractelement <16 x i8> %in, i64 15 + %223 = insertelement <128 x i8> %221, i8 %222, i32 111 + %224 = extractelement <16 x i8> %in, i64 0 + %225 = insertelement <128 x i8> %223, i8 %224, i32 112 + %226 = 
extractelement <16 x i8> %in, i64 1 + %227 = insertelement <128 x i8> %225, i8 %226, i32 113 + %228 = extractelement <16 x i8> %in, i64 2 + %229 = insertelement <128 x i8> %227, i8 %228, i32 114 + %230 = extractelement <16 x i8> %in, i64 3 + %231 = insertelement <128 x i8> %229, i8 %230, i32 115 + %232 = extractelement <16 x i8> %in, i64 4 + %233 = insertelement <128 x i8> %231, i8 %232, i32 116 + %234 = extractelement <16 x i8> %in, i64 5 + %235 = insertelement <128 x i8> %233, i8 %234, i32 117 + %236 = extractelement <16 x i8> %in, i64 6 + %237 = insertelement <128 x i8> %235, i8 %236, i32 118 + %238 = extractelement <16 x i8> %in, i64 7 + %239 = insertelement <128 x i8> %237, i8 %238, i32 119 + %240 = extractelement <16 x i8> %in, i64 8 + %241 = insertelement <128 x i8> %239, i8 %240, i32 120 + %242 = extractelement <16 x i8> %in, i64 9 + %243 = insertelement <128 x i8> %241, i8 %242, i32 121 + %244 = extractelement <16 x i8> %in, i64 10 + %245 = insertelement <128 x i8> %243, i8 %244, i32 122 + %246 = extractelement <16 x i8> %in, i64 11 + %247 = insertelement <128 x i8> %245, i8 %246, i32 123 + %248 = extractelement <16 x i8> %in, i64 12 + %249 = insertelement <128 x i8> %247, i8 %248, i32 124 + %250 = extractelement <16 x i8> %in, i64 13 + %251 = insertelement <128 x i8> %249, i8 %250, i32 125 + %252 = extractelement <16 x i8> %in, i64 14 + %253 = insertelement <128 x i8> %251, i8 %252, i32 126 + %254 = extractelement <16 x i8> %in, i64 15 + %255 = insertelement <128 x i8> %253, i8 %254, i32 127 + %256 = insertelement <16 x i8> poison, i8 %160, i64 0 + %257 = insertelement <16 x i8> %256, i8 %162, i64 1 + %258 = insertelement <16 x i8> %257, i8 %164, i64 2 + %259 = insertelement <16 x i8> %258, i8 %166, i64 3 + %260 = insertelement <16 x i8> %259, i8 %168, i64 4 + %261 = insertelement <16 x i8> %260, i8 %170, i64 5 + %262 = insertelement <16 x i8> %261, i8 %172, i64 6 + %263 = insertelement <16 x i8> %262, i8 %174, i64 7 + %264 = insertelement <16 x i8> %263, i8 %176, i64 8 + %265 = insertelement <16 x i8> %264, i8 %178, i64 9 + %266 = insertelement <16 x i8> %265, i8 %180, i64 10 + %267 = insertelement <16 x i8> %266, i8 %182, i64 11 + %268 = insertelement <16 x i8> %267, i8 %184, i64 12 + %269 = insertelement <16 x i8> %268, i8 %186, i64 13 + %270 = insertelement <16 x i8> %269, i8 %188, i64 14 + %271 = insertelement <16 x i8> %270, i8 %190, i64 15 + %sum = add <16 x i8> %271, %add + store <16 x i8> %sum, ptr addrspace(3) %out, align 16 + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="2,2" } diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 11a5a5785a6ec..89f1ca6935cff 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -57,8 +57,13 @@ # so we just exclude llvm-reduce tests from this config altogether. This should # be fine though as profcheck config tests are mostly concerned with opt. config.excludes.append("llvm-reduce") + # Exclude llvm-objcopy tests - not the target of this effort, and some use + # cat in ways that conflict with how profcheck uses it. + config.excludes.append("llvm-objcopy") # (Issue #161235) Temporarily exclude LoopVectorize. config.excludes.append("LoopVectorize") + # exclude UpdateTestChecks - they fail because of inserted prof annotations + config.excludes.append("UpdateTestChecks") # test_source_root: The root path where tests are located. 
config.test_source_root = os.path.dirname(__file__) @@ -474,7 +479,7 @@ def enable_ptxas(ptxas_executable): config.available_features.add("host-byteorder-" + sys.byteorder + "-endian") if config.target_triple: if re.match( - r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|s390x|s390|tce|thumbeb)-.*", + r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|sparc64|s390x|s390|tce|thumbeb)-.*", config.target_triple, ): config.available_features.add("target-byteorder-big-endian") @@ -579,7 +584,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll new file mode 100644 index 0000000000000..292637177591f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected new file mode 100644 index 0000000000000..88cb03e85204a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { +; ASM-LABEL: test1: +; ASM: # %bb.0: +; ASM-NEXT: movq %rdi, %rax +; ASM-NEXT: addq -{{[0-9]+}}(%rsp), %rax +; ASM-NEXT: retq +; MIR-LABEL: name: test1 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $rdi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi +; MIR-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc) +; MIR-NEXT: $rax = COPY [[ADD64rm]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { +; ASM-LABEL: test2: +; ASM: # %bb.0: +; ASM-NEXT: movl %edi, %eax +; ASM-NEXT: addl -{{[0-9]+}}(%rsp), %eax +; ASM-NEXT: retq +; MIR-LABEL: name: test2 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $edi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi +; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc) +; MIR-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit +; MIR-NEXT: $rax = COPY 
[[SUBREG_TO_REG]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll new file mode 100644 index 0000000000000..7167bcf258e68 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected new file mode 100644 index 0000000000000..1ba920d1de8b0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test new file mode 100644 index 0000000000000..6fc57b583b37d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test @@ -0,0 +1,9 @@ +# REQUIRES: x86-registered-target +## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file + +# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll + +## Verify that running the script again on an already updated file doesn't add duplicate checks +# RUN: %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test new file mode 100644 index 0000000000000..bb91a44678f1a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test @@ -0,0 +1,7 @@ +## Test that using the same prefix for both ASM and MIR outputs generates a warning +## and doesn't produce any checks. 
+ +# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING +# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll + +# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK diff --git a/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test new file mode 100644 index 0000000000000..00141f12587d4 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test @@ -0,0 +1,33 @@ +# RUN: dsymutil -include-swiftmodules-from-interface -verbose -oso-prepend-path=%p -y -o %t.dSYM %s | FileCheck %s +# +# RUN: dsymutil -include-swiftmodules-from-interface --linker parallel -verbose -oso-prepend-path=%p -y %s -o %t-parallel.dSYM | FileCheck %s +# +# To regenerate: +# echo ''>I.swift +# echo ''>B.swift +# echo 'import I'>main.swift +# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift +# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options +# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options +# output is "B.swiftmodule" and "cache/I*.swiftmodule" +# +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule + +# +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '../Inputs/Binary.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] +... diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index 1574fe35f5254..0b0bce194d575 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -14,6 +14,7 @@ CHECK: -fat64 CHECK: -flat CHECK: -gen-reproducer CHECK: -help +CHECK: -include-swiftmodules-from-interface CHECK: -keep-function-for-static CHECK: -no-object-timestamp CHECK: -no-odr diff --git a/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml new file mode 100644 index 0000000000000..2a8c37da80e64 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml @@ -0,0 +1,136 @@ +## Tests the --filter-child-tag (-t) option. 
+ +# RUN: yaml2obj %s -o %t.o + +# RUN: llvm-dwarfdump %t.o --filter-child-tag=DW_TAG_structure_type | FileCheck %s --check-prefix=ONLY_STRUCT + +# ONLY_STRUCT: DW_TAG_compile_unit +# ONLY_STRUCT-NOT: DW_TAG_namespace +# ONLY_STRUCT-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -t DW_TAG_structure_type -t DW_TAG_namespace | \ +# RUN: FileCheck %s --check-prefix=STRUCT_AND_NS --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_member + +# STRUCT_AND_NS: DW_TAG_compile_unit +# STRUCT_AND_NS: DW_TAG_namespace +# STRUCT_AND_NS: DW_TAG_structure_type +# STRUCT_AND_NS: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# FOO_MEM: DW_TAG_structure_type +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM-NOT: DW_TAG_structure_type +# FOO_MEM-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=SINGLE_INVALID_TAG --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# SINGLE_INVALID_TAG: DW_TAG_structure_type +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG-NOT: DW_TAG_structure_type +# SINGLE_INVALID_TAG-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag | \ +# RUN: FileCheck %s --check-prefix=ONLY_INVALID_TAGS --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace --implicit-check-not=DW_TAG_member + +# ONLY_INVALID_TAGS: DW_TAG_structure_type +# ONLY_INVALID_TAGS-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c -p --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM_WITH_PARENT --implicit-check-not=DW_TAG_subprogram + +# FOO_MEM_WITH_PARENT: DW_TAG_compile_unit +# FOO_MEM_WITH_PARENT: DW_TAG_namespace +# FOO_MEM_WITH_PARENT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_member + +## Not specifying --show-children ignores the --filter-child-tag option. 
+# RUN: llvm-dwarfdump %t.o --name=Foo -t DW_TAG_member 2>&1 | FileCheck %s --check-prefix=NO_SHOW_CHILDREN + +# NO_SHOW_CHILDREN: DW_TAG_structure_type + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_abbrev: + - Table: + - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_string + - Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 5 + UnitType: DW_UT_compile + Entries: + - AbbrCode: 1 + Values: + - CStr: handwritten + - AbbrCode: 2 + Values: + - CStr: ns + - AbbrCode: 3 + Values: + - CStr: Foo + - AbbrCode: 4 + Values: + - CStr: mem1 + - AbbrCode: 4 + Values: + - CStr: mem2 + - AbbrCode: 4 + Values: + - CStr: mem3 + - AbbrCode: 3 + Values: + - CStr: NestedInFoo + - AbbrCode: 4 + Values: + - CStr: NestedMem1 + - AbbrCode: 4 + Values: + - CStr: NestedMem2 + - AbbrCode: 5 + Values: + - CStr: NestedFunc + - AbbrCode: 0x0 + - AbbrCode: 5 + Values: + - CStr: FooFunc + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x0 diff --git a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll index a6ace2246fa8e..b800f9aa97c8f 100644 --- a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll +++ b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll @@ -26,7 +26,7 @@ define i32 @t(i32 %a) { ; CHECK-ALL: declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.uadd.sat.i32(i32, i32) #0 -; CHECK-ALL: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK-ALL: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK-INTERESTINGNESS: attributes #1 = { ; CHECK-INTERESTINGNESS-SAME: "arg4" diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index b91c27e6a0f86..ee1e9060657b0 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -794,9 +794,10 @@ bool DwarfLinkerForBinary::linkImpl( reportWarning("Could not parse binary Swift module: " + toString(FromInterfaceOrErr.takeError()), Obj->getObjectFilename()); - // Only skip swiftmodules that could be parsed and are - // positively identified as textual. - } else if (*FromInterfaceOrErr) { + // Only skip swiftmodules that could be parsed and are positively + // identified as textual. Do so only when the option allows. + } else if (*FromInterfaceOrErr && + !Options.IncludeSwiftModulesFromInterface) { if (Options.Verbose) outs() << "Skipping compiled textual Swift interface: " << Obj->getObjectFilename() << "\n"; diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index ad5515a04333e..c333a3d4afee0 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -114,6 +114,13 @@ struct LinkOptions { /// Whether all remarks should be kept or only remarks with valid debug /// locations. 
bool RemarksKeepAll = true; + + /// Whether or not to copy binary swiftmodules built from textual + /// .swiftinterface files into the dSYM bundle. These typically come only + /// from the SDK (since textual interfaces require library evolution) and + /// thus are a waste of space to copy into the bundle. Turn this on if the + /// swiftmodules are different from those in the SDK. + bool IncludeSwiftModulesFromInterface = false; /// @} LinkOptions() = default; diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index ad35e55e33b12..e99bc12fa7fd8 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -202,6 +202,14 @@ def remarks_drop_without_debug: Flag<["--", "-"], "remarks-drop-without-debug">, "all remarks are kept.">, Group<grp_general>; +def include_swiftmodules_from_interface: Flag<["--", "-"], "include-swiftmodules-from-interface">, + HelpText<"Whether or not to copy binary swiftmodules built from textual " + ".swiftinterface files into the dSYM bundle. These typically come only " + "from the SDK (since textual interfaces require library evolution) and " + "thus are a waste of space to copy into the bundle. Turn this on if the " + "swiftmodules are different from those in the SDK.">, + Group<grp_general>; + def linker: Separate<["--", "-"], "linker">, MetaVarName<"<DWARF linker type>">, HelpText<"Specify the desired type of DWARF linker. Defaults to 'classic'">, diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 913077eb0b06d..688f6aaf3d0c9 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -391,6 +391,9 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) { Options.LinkOpts.RemarksKeepAll = !Args.hasArg(OPT_remarks_drop_without_debug); + Options.LinkOpts.IncludeSwiftModulesFromInterface = + Args.hasArg(OPT_include_swiftmodules_from_interface); + if (opt::Arg *BuildVariantSuffix = Args.getLastArg(OPT_build_variant_suffix)) Options.LinkOpts.BuildVariantSuffix = BuildVariantSuffix->getValue(); diff --git a/llvm/tools/llvm-dwarfdump/CMakeLists.txt b/llvm/tools/llvm-dwarfdump/CMakeLists.txt index aeb1b8f14d830..7a0adf32e938c 100644 --- a/llvm/tools/llvm-dwarfdump/CMakeLists.txt +++ b/llvm/tools/llvm-dwarfdump/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + BinaryFormat DebugInfoDWARF DebugInfoDWARFLowLevel AllTargetsDescs diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 11eb58ea911df..6f120f93700f6 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" @@ -242,6 +243,15 @@ static opt<bool> cat(DwarfDumpCategory)); static alias ShowParentsAlias("p", desc("Alias for --show-parents."), aliasopt(ShowParents), cl::NotHidden); + +static list<std::string> FilterChildTag( + "filter-child-tag", + desc("When --show-children is specified, show only DIEs with the " + "specified DWARF tags."), + value_desc("list of DWARF tags"), cat(DwarfDumpCategory)); +static alias FilterChildTagAlias("t", desc("Alias for --filter-child-tag."), + aliasopt(FilterChildTag), cl::NotHidden); + static opt<bool> ShowForm("show-form", desc("Show DWARF form types after the DWARF 
attribute types."), @@ -330,6 +340,13 @@ static cl::extrahelp /// @} //===----------------------------------------------------------------------===// +static llvm::SmallVector<unsigned> +makeTagVector(const list<std::string> &TagStrings) { + return llvm::map_to_vector(TagStrings, [](const std::string &Tag) { + return llvm::dwarf::getTag(Tag); + }); +} + static void error(Error Err) { if (!Err) return; @@ -356,6 +373,7 @@ static DIDumpOptions getDumpOpts(DWARFContext &C) { DumpOpts.ShowAddresses = !Diff; DumpOpts.ShowChildren = ShowChildren; DumpOpts.ShowParents = ShowParents; + DumpOpts.FilterChildTag = makeTagVector(FilterChildTag); DumpOpts.ShowForm = ShowForm; DumpOpts.SummarizeTypes = SummarizeTypes; DumpOpts.Verbose = Verbose; diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 79216e89c7cba..88d6daf08d35e 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -776,6 +776,7 @@ createSharedMemoryManager(SimpleRemoteEPC &SREPC) { SlabSize, SREPC, SAs); } +#if LLVM_ON_UNIX && LLVM_ENABLE_THREADS static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) { switch (UseMemMgr) { case MemMgr::Default: @@ -789,6 +790,7 @@ static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) { break; } } +#endif static Expected<MaterializationUnit::Interface> getTestObjectFileInterface(Session &S, MemoryBufferRef O) { diff --git a/llvm/tools/opt-viewer/optrecord.py b/llvm/tools/opt-viewer/optrecord.py index b9244fd1ae739..07e21028535c4 100644 --- a/llvm/tools/opt-viewer/optrecord.py +++ b/llvm/tools/opt-viewer/optrecord.py @@ -344,6 +344,8 @@ def find_opt_files(*dirs_or_files): d for d in subdirs if not os.path.ismount(os.path.join(dir, d)) ] for file in files: - if fnmatch.fnmatch(file, "*.opt.yaml*"): + if fnmatch.fnmatch(file, "*.opt.yaml*") or fnmatch.fnmatch( + file, "*.opt.ld.yaml*" + ): all.append(os.path.join(dir, file)) return all diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index fbe96bb127836..99cc38b6b422b 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -10118,7 +10118,7 @@ TEST(APFloatTest, Float4E2M1FNToFloat) { } TEST(APFloatTest, AddOrSubtractSignificand) { - typedef detail::IEEEFloatUnitTestHelper Helper; + using Helper = detail::IEEEFloatUnitTestHelper; // Test cases are all combinations of: // {equal exponents, LHS larger exponent, RHS larger exponent} // {equal significands, LHS larger significand, RHS larger significand} diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index 12ba0041af551..e13523b8e10c3 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -21,7 +21,7 @@ template <typename T> class BitVectorTest : public ::testing::Test { }; // Test both BitVector and SmallBitVector with the same suite of tests. 
-typedef ::testing::Types<BitVector, SmallBitVector> BitVectorTestTypes; +using BitVectorTestTypes = ::testing::Types<BitVector, SmallBitVector>; TYPED_TEST_SUITE(BitVectorTest, BitVectorTestTypes, ); TYPED_TEST(BitVectorTest, TrivialOperation) { @@ -857,7 +857,7 @@ TYPED_TEST(BitVectorTest, BinOps) { EXPECT_FALSE(B.anyCommon(A)); } -typedef std::vector<std::pair<int, int>> RangeList; +using RangeList = std::vector<std::pair<int, int>>; template <typename VecType> static inline VecType createBitVector(uint32_t Size, diff --git a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp index a737390e79d8d..571e4d27c6752 100644 --- a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp +++ b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace llvm { TEST(BreadthFristIteratorTest, Basic) { - typedef bf_iterator<Graph<4>> BFIter; + using BFIter = bf_iterator<Graph<4>>; Graph<4> G; G.AddEdge(0, 1); @@ -46,7 +46,7 @@ TEST(BreadthFristIteratorTest, Basic) { } TEST(BreadthFristIteratorTest, Cycle) { - typedef bf_iterator<Graph<4>> BFIter; + using BFIter = bf_iterator<Graph<4>>; Graph<4> G; G.AddEdge(0, 1); diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp index f543947899393..918a2e63da935 100644 --- a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp @@ -15,7 +15,7 @@ using namespace llvm; namespace { -typedef DAGDeltaAlgorithm::edge_ty edge_ty; +using edge_ty = DAGDeltaAlgorithm::edge_ty; class FixedDAGDeltaAlgorithm : public DAGDeltaAlgorithm { changeset_ty FailingSet; diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index aceb4f30d878d..273ee09fc1e28 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -129,18 +129,17 @@ typename T::mapped_type *const DenseMapTest<T>::dummy_value_ptr = nullptr; // Register these types for testing. // clang-format off -typedef ::testing::Types<DenseMap<uint32_t, uint32_t>, - DenseMap<uint32_t *, uint32_t *>, - DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>, - DenseMap<EnumClass, uint32_t>, - DenseMap<std::optional<uint32_t>, uint32_t>, - SmallDenseMap<uint32_t, uint32_t>, - SmallDenseMap<uint32_t *, uint32_t *>, - SmallDenseMap<CtorTester, CtorTester, 4, - CtorTesterMapInfo>, - SmallDenseMap<EnumClass, uint32_t>, - SmallDenseMap<std::optional<uint32_t>, uint32_t> - > DenseMapTestTypes; +using DenseMapTestTypes = ::testing::Types< + DenseMap<uint32_t, uint32_t>, + DenseMap<uint32_t *, uint32_t *>, + DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>, + DenseMap<EnumClass, uint32_t>, + DenseMap<std::optional<uint32_t>, uint32_t>, + SmallDenseMap<uint32_t, uint32_t>, + SmallDenseMap<uint32_t *, uint32_t *>, + SmallDenseMap<CtorTester, CtorTester, 4, CtorTesterMapInfo>, + SmallDenseMap<EnumClass, uint32_t>, + SmallDenseMap<std::optional<uint32_t>, uint32_t>>; // clang-format on TYPED_TEST_SUITE(DenseMapTest, DenseMapTestTypes, ); diff --git a/llvm/unittests/ADT/DenseSetTest.cpp b/llvm/unittests/ADT/DenseSetTest.cpp index a24f99b6bb34f..a2a062b151b67 100644 --- a/llvm/unittests/ADT/DenseSetTest.cpp +++ b/llvm/unittests/ADT/DenseSetTest.cpp @@ -96,13 +96,13 @@ template <typename T> class DenseSetTest : public testing::Test { }; // Register these types for testing. 
-typedef ::testing::Types<DenseSet<unsigned, TestDenseSetInfo>, - const DenseSet<unsigned, TestDenseSetInfo>, - SmallDenseSet<unsigned, 1, TestDenseSetInfo>, - SmallDenseSet<unsigned, 4, TestDenseSetInfo>, - const SmallDenseSet<unsigned, 4, TestDenseSetInfo>, - SmallDenseSet<unsigned, 64, TestDenseSetInfo>> - DenseSetTestTypes; +using DenseSetTestTypes = + ::testing::Types<DenseSet<unsigned, TestDenseSetInfo>, + const DenseSet<unsigned, TestDenseSetInfo>, + SmallDenseSet<unsigned, 1, TestDenseSetInfo>, + SmallDenseSet<unsigned, 4, TestDenseSetInfo>, + const SmallDenseSet<unsigned, 4, TestDenseSetInfo>, + SmallDenseSet<unsigned, 64, TestDenseSetInfo>>; TYPED_TEST_SUITE(DenseSetTest, DenseSetTestTypes, ); TYPED_TEST(DenseSetTest, Constructor) { diff --git a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp index f792878004e7a..00312ca6044e6 100644 --- a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp +++ b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace llvm { template <typename T> struct CountedSet { - typedef typename SmallPtrSet<T, 4>::iterator iterator; + using iterator = typename SmallPtrSet<T, 4>::iterator; SmallPtrSet<T, 4> S; int InsertVisited = 0; @@ -44,8 +44,8 @@ template <typename T> class df_iterator_storage<CountedSet<T>, true> { }; TEST(DepthFirstIteratorTest, ActuallyUpdateIterator) { - typedef CountedSet<Graph<3>::NodeType *> StorageT; - typedef df_iterator<Graph<3>, StorageT, true> DFIter; + using StorageT = CountedSet<Graph<3>::NodeType *>; + using DFIter = df_iterator<Graph<3>, StorageT, true>; Graph<3> G; G.AddEdge(0, 1); diff --git a/llvm/unittests/ADT/FallibleIteratorTest.cpp b/llvm/unittests/ADT/FallibleIteratorTest.cpp index d3389744ffbfe..c17aa0393dfcb 100644 --- a/llvm/unittests/ADT/FallibleIteratorTest.cpp +++ b/llvm/unittests/ADT/FallibleIteratorTest.cpp @@ -19,8 +19,8 @@ using namespace llvm; namespace { -using ItemValid = enum { ValidItem, InvalidItem }; -using LinkValid = enum { ValidLink, InvalidLink }; +enum ItemValid { ValidItem, InvalidItem }; +enum LinkValid { ValidLink, InvalidLink }; class Item { public: diff --git a/llvm/unittests/ADT/IListBaseTest.cpp b/llvm/unittests/ADT/IListBaseTest.cpp index bd915688b190d..eeed488c28d88 100644 --- a/llvm/unittests/ADT/IListBaseTest.cpp +++ b/llvm/unittests/ADT/IListBaseTest.cpp @@ -19,13 +19,14 @@ template <typename T> class IListBaseTest : public ::testing::Test {}; class Parent; // Test variants with the same test. 
-typedef ::testing::Types<ilist_base<false, void>, ilist_base<true, void>, ilist_base<false, Parent*>, ilist_base<true, Parent*>> - IListBaseTestTypes; +using IListBaseTestTypes = + ::testing::Types<ilist_base<false, void>, ilist_base<true, void>, + ilist_base<false, Parent *>, ilist_base<true, Parent *>>; TYPED_TEST_SUITE(IListBaseTest, IListBaseTestTypes, ); TYPED_TEST(IListBaseTest, insertBeforeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -51,8 +52,8 @@ TYPED_TEST(IListBaseTest, insertBeforeImpl) { } TYPED_TEST(IListBaseTest, removeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -80,8 +81,8 @@ TYPED_TEST(IListBaseTest, removeImpl) { } TYPED_TEST(IListBaseTest, removeRangeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B, C, D; @@ -106,8 +107,8 @@ TYPED_TEST(IListBaseTest, removeRangeImpl) { } TYPED_TEST(IListBaseTest, removeRangeImplAllButSentinel) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -126,8 +127,8 @@ TYPED_TEST(IListBaseTest, removeRangeImplAllButSentinel) { } TYPED_TEST(IListBaseTest, transferBeforeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S1, S2, A, B, C, D, E; diff --git a/llvm/unittests/ADT/IListIteratorTest.cpp b/llvm/unittests/ADT/IListIteratorTest.cpp index 4e5b847b35ffe..54a4258246e9b 100644 --- a/llvm/unittests/ADT/IListIteratorTest.cpp +++ b/llvm/unittests/ADT/IListIteratorTest.cpp @@ -141,10 +141,10 @@ TEST(IListIteratorTest, ReverseConstructor) { L.insert(L.end(), B); // Save typing. - typedef simple_ilist<Node>::iterator iterator; - typedef simple_ilist<Node>::reverse_iterator reverse_iterator; - typedef simple_ilist<Node>::const_iterator const_iterator; - typedef simple_ilist<Node>::const_reverse_iterator const_reverse_iterator; + using iterator = simple_ilist<Node>::iterator; + using reverse_iterator = simple_ilist<Node>::reverse_iterator; + using const_iterator = simple_ilist<Node>::const_iterator; + using const_reverse_iterator = simple_ilist<Node>::const_reverse_iterator; // Check conversion values. 
EXPECT_EQ(L.begin(), iterator(L.rend())); diff --git a/llvm/unittests/ADT/IListNodeBaseTest.cpp b/llvm/unittests/ADT/IListNodeBaseTest.cpp index ef90c716a4118..393f83af99b76 100644 --- a/llvm/unittests/ADT/IListNodeBaseTest.cpp +++ b/llvm/unittests/ADT/IListNodeBaseTest.cpp @@ -17,10 +17,10 @@ namespace { class Parent {}; -typedef ilist_node_base<false, void> RawNode; -typedef ilist_node_base<true, void> TrackingNode; -typedef ilist_node_base<false, Parent> ParentNode; -typedef ilist_node_base<true, Parent> ParentTrackingNode; +using RawNode = ilist_node_base<false, void>; +using TrackingNode = ilist_node_base<true, void>; +using ParentNode = ilist_node_base<false, Parent>; +using ParentTrackingNode = ilist_node_base<true, Parent>; TEST(IListNodeBaseTest, DefaultConstructor) { RawNode A; diff --git a/llvm/unittests/ADT/IListSentinelTest.cpp b/llvm/unittests/ADT/IListSentinelTest.cpp index 1f4a8311370a6..709a1a4bb90e7 100644 --- a/llvm/unittests/ADT/IListSentinelTest.cpp +++ b/llvm/unittests/ADT/IListSentinelTest.cpp @@ -14,18 +14,17 @@ using namespace llvm; namespace { template <class T, class... Options> struct PickSentinel { - typedef ilist_sentinel< - typename ilist_detail::compute_node_options<T, Options...>::type> - type; + using type = ilist_sentinel< + typename ilist_detail::compute_node_options<T, Options...>::type>; }; class Node : public ilist_node<Node> {}; class TrackingNode : public ilist_node<Node, ilist_sentinel_tracking<true>> {}; -typedef PickSentinel<Node>::type Sentinel; -typedef PickSentinel<Node, ilist_sentinel_tracking<true>>::type - TrackingSentinel; -typedef PickSentinel<Node, ilist_sentinel_tracking<false>>::type - NoTrackingSentinel; +using Sentinel = PickSentinel<Node>::type; +using TrackingSentinel = + PickSentinel<Node, ilist_sentinel_tracking<true>>::type; +using NoTrackingSentinel = + PickSentinel<Node, ilist_sentinel_tracking<false>>::type; struct LocalAccess : ilist_detail::NodeAccess { using NodeAccess::getPrev; diff --git a/llvm/unittests/ADT/IntervalMapTest.cpp b/llvm/unittests/ADT/IntervalMapTest.cpp index 99a93ab198d89..38f397ff2eb54 100644 --- a/llvm/unittests/ADT/IntervalMapTest.cpp +++ b/llvm/unittests/ADT/IntervalMapTest.cpp @@ -14,9 +14,9 @@ using namespace llvm; namespace { -typedef IntervalMap<unsigned, unsigned, 4> UUMap; -typedef IntervalMap<unsigned, unsigned, 4, - IntervalMapHalfOpenInfo<unsigned>> UUHalfOpenMap; +using UUMap = IntervalMap<unsigned, unsigned, 4>; +using UUHalfOpenMap = + IntervalMap<unsigned, unsigned, 4, IntervalMapHalfOpenInfo<unsigned>>; // Empty map tests TEST(IntervalMapTest, EmptyMap) { @@ -713,7 +713,7 @@ TEST(IntervalMapTest, OverlapsHalfOpen) { } TEST(IntervalMapOverlapsTest, SmallMaps) { - typedef IntervalMapOverlaps<UUMap,UUMap> UUOverlaps; + using UUOverlaps = IntervalMapOverlaps<UUMap, UUMap>; UUMap::Allocator allocator; UUMap mapA(allocator); UUMap mapB(allocator); @@ -757,7 +757,7 @@ TEST(IntervalMapOverlapsTest, SmallMaps) { } TEST(IntervalMapOverlapsTest, BigMaps) { - typedef IntervalMapOverlaps<UUMap,UUMap> UUOverlaps; + using UUOverlaps = IntervalMapOverlaps<UUMap, UUMap>; UUMap::Allocator allocator; UUMap mapA(allocator); UUMap mapB(allocator); diff --git a/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp b/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp index f4f2083482804..6da42271764bc 100644 --- a/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp +++ b/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp @@ -25,9 +25,9 @@ struct SimpleRefCounted : Base<SimpleRefCounted<Base>> { template <typename T> struct 
IntrusiveRefCntPtrTest : testing::Test {}; -typedef ::testing::Types<SimpleRefCounted<RefCountedBase>, - SimpleRefCounted<ThreadSafeRefCountedBase>> - IntrusiveRefCntTypes; +using IntrusiveRefCntTypes = + ::testing::Types<SimpleRefCounted<RefCountedBase>, + SimpleRefCounted<ThreadSafeRefCountedBase>>; TYPED_TEST_SUITE(IntrusiveRefCntPtrTest, IntrusiveRefCntTypes, ); TYPED_TEST(IntrusiveRefCntPtrTest, RefCountedBaseCopyDoesNotLeak) { diff --git a/llvm/unittests/ADT/IteratorTest.cpp b/llvm/unittests/ADT/IteratorTest.cpp index b5d63efd8ccba..9dd8c1a84f44a 100644 --- a/llvm/unittests/ADT/IteratorTest.cpp +++ b/llvm/unittests/ADT/IteratorTest.cpp @@ -177,8 +177,8 @@ TEST(PointeeIteratorTest, Basic) { V.push_back(&arr[2]); V.push_back(&arr[3]); - typedef pointee_iterator<SmallVectorImpl<int *>::const_iterator> - test_iterator; + using test_iterator = + pointee_iterator<SmallVectorImpl<int *>::const_iterator>; test_iterator Begin, End; Begin = V.begin(); @@ -218,9 +218,8 @@ TEST(PointeeIteratorTest, SmartPointer) { V.push_back(std::make_unique<int>(3)); V.push_back(std::make_unique<int>(4)); - typedef pointee_iterator< - SmallVectorImpl<std::unique_ptr<int>>::const_iterator> - test_iterator; + using test_iterator = + pointee_iterator<SmallVectorImpl<std::unique_ptr<int>>::const_iterator>; test_iterator Begin, End; Begin = V.begin(); diff --git a/llvm/unittests/ADT/PointerSumTypeTest.cpp b/llvm/unittests/ADT/PointerSumTypeTest.cpp index fbf59f3a2fda5..11e657ad8bd25 100644 --- a/llvm/unittests/ADT/PointerSumTypeTest.cpp +++ b/llvm/unittests/ADT/PointerSumTypeTest.cpp @@ -17,10 +17,9 @@ struct PointerSumTypeTest : public testing::Test { float f; int i1, i2; - typedef PointerSumType<Kinds, PointerSumTypeMember<Float, float *>, - PointerSumTypeMember<Int1, int *>, - PointerSumTypeMember<Int2, int *>> - SumType; + using SumType = PointerSumType<Kinds, PointerSumTypeMember<Float, float *>, + PointerSumTypeMember<Int1, int *>, + PointerSumTypeMember<Int2, int *>>; SumType a, b, c, n; PointerSumTypeTest() diff --git a/llvm/unittests/ADT/PointerUnionTest.cpp b/llvm/unittests/ADT/PointerUnionTest.cpp index acddb78960149..d8ac3aed76da2 100644 --- a/llvm/unittests/ADT/PointerUnionTest.cpp +++ b/llvm/unittests/ADT/PointerUnionTest.cpp @@ -12,9 +12,9 @@ using namespace llvm; namespace { -typedef PointerUnion<int *, float *> PU; -typedef PointerUnion<int *, float *, long long *> PU3; -typedef PointerUnion<int *, float *, long long *, double *> PU4; +using PU = PointerUnion<int *, float *>; +using PU3 = PointerUnion<int *, float *, long long *>; +using PU4 = PointerUnion<int *, float *, long long *, double *>; struct PointerUnionTest : public testing::Test { float f; @@ -116,9 +116,9 @@ TEST_F(PointerUnionTest, Get) { template<int I> struct alignas(8) Aligned {}; -typedef PointerUnion<Aligned<0> *, Aligned<1> *, Aligned<2> *, Aligned<3> *, - Aligned<4> *, Aligned<5> *, Aligned<6> *, Aligned<7> *> - PU8; +using PU8 = + PointerUnion<Aligned<0> *, Aligned<1> *, Aligned<2> *, Aligned<3> *, + Aligned<4> *, Aligned<5> *, Aligned<6> *, Aligned<7> *>; TEST_F(PointerUnionTest, ManyElements) { Aligned<0> a0; diff --git a/llvm/unittests/ADT/PostOrderIteratorTest.cpp b/llvm/unittests/ADT/PostOrderIteratorTest.cpp index 838481f76ed7f..e875dd63a1958 100644 --- a/llvm/unittests/ADT/PostOrderIteratorTest.cpp +++ b/llvm/unittests/ADT/PostOrderIteratorTest.cpp @@ -23,7 +23,7 @@ namespace { // Whether we're able to compile TEST(PostOrderIteratorTest, Compiles) { - typedef SmallPtrSet<void *, 4> ExtSetTy; + using ExtSetTy = 
SmallPtrSet<void *, 4>; // Tests that template specializations are kept up to date void *Null = nullptr; diff --git a/llvm/unittests/ADT/PriorityWorklistTest.cpp b/llvm/unittests/ADT/PriorityWorklistTest.cpp index f12d32ac9f496..08a47736c392e 100644 --- a/llvm/unittests/ADT/PriorityWorklistTest.cpp +++ b/llvm/unittests/ADT/PriorityWorklistTest.cpp @@ -20,8 +20,8 @@ namespace { using namespace llvm; template <typename T> class PriorityWorklistTest : public ::testing::Test {}; -typedef ::testing::Types<PriorityWorklist<int>, SmallPriorityWorklist<int, 2>> - TestTypes; +using TestTypes = + ::testing::Types<PriorityWorklist<int>, SmallPriorityWorklist<int, 2>>; TYPED_TEST_SUITE(PriorityWorklistTest, TestTypes, ); TYPED_TEST(PriorityWorklistTest, Basic) { diff --git a/llvm/unittests/ADT/RangeAdapterTest.cpp b/llvm/unittests/ADT/RangeAdapterTest.cpp index c1a8a984f233b..6849ccbc8052d 100644 --- a/llvm/unittests/ADT/RangeAdapterTest.cpp +++ b/llvm/unittests/ADT/RangeAdapterTest.cpp @@ -24,8 +24,8 @@ class ReverseOnlyVector { public: ReverseOnlyVector(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::reverse_iterator reverse_iterator; - typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; + using const_reverse_iterator = std::vector<int>::const_reverse_iterator; reverse_iterator rbegin() { return Vec.rbegin(); } reverse_iterator rend() { return Vec.rend(); } const_reverse_iterator rbegin() const { return Vec.rbegin(); } @@ -41,11 +41,11 @@ class BidirectionalVector { public: BidirectionalVector(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::iterator iterator; + using iterator = std::vector<int>::iterator; iterator begin() const; iterator end() const; - typedef std::vector<int>::reverse_iterator reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; reverse_iterator rbegin() const { return Vec.rbegin(); } reverse_iterator rend() const { return Vec.rend(); } }; @@ -58,15 +58,15 @@ class BidirectionalVectorConsts { public: BidirectionalVectorConsts(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::iterator iterator; - typedef std::vector<int>::const_iterator const_iterator; + using iterator = std::vector<int>::iterator; + using const_iterator = std::vector<int>::const_iterator; iterator begin(); iterator end(); const_iterator begin() const; const_iterator end() const; - typedef std::vector<int>::reverse_iterator reverse_iterator; - typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; + using const_reverse_iterator = std::vector<int>::const_reverse_iterator; reverse_iterator rbegin() { return Vec.rbegin(); } reverse_iterator rend() { return Vec.rend(); } const_reverse_iterator rbegin() const { return Vec.rbegin(); } @@ -80,7 +80,7 @@ class CustomIteratorVector { public: CustomIteratorVector(std::initializer_list<int> list) : V(list) {} - typedef std::vector<int>::iterator iterator; + using iterator = std::vector<int>::iterator; class reverse_iterator { std::vector<int>::iterator I; @@ -126,8 +126,8 @@ template <typename R> void TestRev(const R &r) { // Test fixture template <typename T> class RangeAdapterLValueTest : public ::testing::Test {}; -typedef ::testing::Types<std::vector<int>, std::list<int>, int[4]> - RangeAdapterLValueTestTypes; +using RangeAdapterLValueTestTypes = + ::testing::Types<std::vector<int>, std::list<int>, 
int[4]>; TYPED_TEST_SUITE(RangeAdapterLValueTest, RangeAdapterLValueTestTypes, ); TYPED_TEST(RangeAdapterLValueTest, TrivialOperation) { @@ -140,10 +140,10 @@ TYPED_TEST(RangeAdapterLValueTest, TrivialOperation) { template <typename T> struct RangeAdapterRValueTest : testing::Test {}; -typedef ::testing::Types<std::vector<int>, std::list<int>, CustomIteratorVector, - ReverseOnlyVector, BidirectionalVector, - BidirectionalVectorConsts> - RangeAdapterRValueTestTypes; +using RangeAdapterRValueTestTypes = + ::testing::Types<std::vector<int>, std::list<int>, CustomIteratorVector, + ReverseOnlyVector, BidirectionalVector, + BidirectionalVectorConsts>; TYPED_TEST_SUITE(RangeAdapterRValueTest, RangeAdapterRValueTestTypes, ); TYPED_TEST(RangeAdapterRValueTest, TrivialOperation) { diff --git a/llvm/unittests/ADT/SCCIteratorTest.cpp b/llvm/unittests/ADT/SCCIteratorTest.cpp index 48350959d046b..5f088294b1a2d 100644 --- a/llvm/unittests/ADT/SCCIteratorTest.cpp +++ b/llvm/unittests/ADT/SCCIteratorTest.cpp @@ -21,7 +21,7 @@ TEST(SCCIteratorTest, AllSmallGraphs) { // create graphs for which every node has a self-edge. #define NUM_NODES 4 #define NUM_GRAPHS (NUM_NODES * (NUM_NODES - 1)) - typedef Graph<NUM_NODES> GT; + using GT = Graph<NUM_NODES>; /// Enumerate all graphs using NUM_GRAPHS bits. static_assert(NUM_GRAPHS < sizeof(unsigned) * CHAR_BIT, "Too many graphs!"); diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp index 966b1f01e8a31..85567775e4ebd 100644 --- a/llvm/unittests/ADT/STLExtrasTest.cpp +++ b/llvm/unittests/ADT/STLExtrasTest.cpp @@ -60,7 +60,7 @@ TEST(STLExtrasTest, EnumerateLValue) { // Test that a simple LValue can be enumerated and gives correct results with // multiple types, including the empty container. std::vector<char> foo = {'a', 'b', 'c'}; - typedef std::pair<std::size_t, char> CharPairType; + using CharPairType = std::pair<std::size_t, char>; std::vector<CharPairType> CharResults; for (auto [index, value] : llvm::enumerate(foo)) { @@ -72,7 +72,7 @@ TEST(STLExtrasTest, EnumerateLValue) { CharPairType(2u, 'c'))); // Test a const range of a different type. - typedef std::pair<std::size_t, int> IntPairType; + using IntPairType = std::pair<std::size_t, int>; std::vector<IntPairType> IntResults; const std::vector<int> bar = {1, 2, 3}; for (auto [index, value] : llvm::enumerate(bar)) { @@ -111,7 +111,7 @@ TEST(STLExtrasTest, EnumerateModifyLValue) { TEST(STLExtrasTest, EnumerateRValueRef) { // Test that an rvalue can be enumerated. - typedef std::pair<std::size_t, int> PairType; + using PairType = std::pair<std::size_t, int>; std::vector<PairType> Results; auto Enumerator = llvm::enumerate(std::vector<int>{1, 2, 3}); @@ -138,7 +138,7 @@ TEST(STLExtrasTest, EnumerateModifyRValue) { // Test that when enumerating an rvalue, modification still works (even if // this isn't terribly useful, it at least shows that we haven't snuck an // extra const in there somewhere. 
- typedef std::pair<std::size_t, char> PairType; + using PairType = std::pair<std::size_t, char>; std::vector<PairType> Results; for (auto X : llvm::enumerate(std::vector<char>{'1', '2', '3'})) { diff --git a/llvm/unittests/ADT/SimpleIListTest.cpp b/llvm/unittests/ADT/SimpleIListTest.cpp index c2992baf8a5f7..cf3df8c293e25 100644 --- a/llvm/unittests/ADT/SimpleIListTest.cpp +++ b/llvm/unittests/ADT/SimpleIListTest.cpp @@ -605,8 +605,8 @@ struct Tag2 {}; struct DoubleNode : ilist_node<DoubleNode, ilist_tag<Tag1>>, ilist_node<DoubleNode, ilist_tag<Tag2>> { - typedef ilist_node<DoubleNode, ilist_tag<Tag1>> Node1Type; - typedef ilist_node<DoubleNode, ilist_tag<Tag2>> Node2Type; + using Node1Type = ilist_node<DoubleNode, ilist_tag<Tag1>>; + using Node2Type = ilist_node<DoubleNode, ilist_tag<Tag2>>; Node1Type::self_iterator getIterator1() { return Node1Type::getIterator(); } Node2Type::self_iterator getIterator2() { return Node2Type::getIterator(); } @@ -617,8 +617,8 @@ struct DoubleNode : ilist_node<DoubleNode, ilist_tag<Tag1>>, return Node2Type::getIterator(); } }; -typedef simple_ilist<DoubleNode, ilist_tag<Tag1>> TaggedList1Type; -typedef simple_ilist<DoubleNode, ilist_tag<Tag2>> TaggedList2Type; +using TaggedList1Type = simple_ilist<DoubleNode, ilist_tag<Tag1>>; +using TaggedList2Type = simple_ilist<DoubleNode, ilist_tag<Tag2>>; TEST(SimpleIListTest, TaggedLists) { TaggedList1Type L1; diff --git a/llvm/unittests/ADT/SmallPtrSetTest.cpp b/llvm/unittests/ADT/SmallPtrSetTest.cpp index a627091b90c70..fe7a8279d06b1 100644 --- a/llvm/unittests/ADT/SmallPtrSetTest.cpp +++ b/llvm/unittests/ADT/SmallPtrSetTest.cpp @@ -57,7 +57,7 @@ TEST(SmallPtrSetTest, GrowthTest) { SmallPtrSet<int *, 4> s; - typedef SmallPtrSet<int *, 4>::iterator iter; + using iter = SmallPtrSet<int *, 4>::iterator; s.insert(&buf[0]); s.insert(&buf[1]); diff --git a/llvm/unittests/ADT/SmallStringTest.cpp b/llvm/unittests/ADT/SmallStringTest.cpp index 2f4df8afeafa5..db858246c9bbf 100644 --- a/llvm/unittests/ADT/SmallStringTest.cpp +++ b/llvm/unittests/ADT/SmallStringTest.cpp @@ -23,7 +23,7 @@ namespace { // Test fixture class class SmallStringTest : public testing::Test { protected: - typedef SmallString<40> StringType; + using StringType = SmallString<40>; StringType theString; diff --git a/llvm/unittests/ADT/SmallVectorTest.cpp b/llvm/unittests/ADT/SmallVectorTest.cpp index 74fc737f29335..dbc626db54482 100644 --- a/llvm/unittests/ADT/SmallVectorTest.cpp +++ b/llvm/unittests/ADT/SmallVectorTest.cpp @@ -226,13 +226,10 @@ class SmallVectorTest : public SmallVectorTestBase { VectorT otherVector; }; - -typedef ::testing::Types<SmallVector<Constructable, 0>, - SmallVector<Constructable, 1>, - SmallVector<Constructable, 2>, - SmallVector<Constructable, 4>, - SmallVector<Constructable, 5> - > SmallVectorTestTypes; +using SmallVectorTestTypes = ::testing::Types< + SmallVector<Constructable, 0>, SmallVector<Constructable, 1>, + SmallVector<Constructable, 2>, SmallVector<Constructable, 4>, + SmallVector<Constructable, 5>>; TYPED_TEST_SUITE(SmallVectorTest, SmallVectorTestTypes, ); // Constructor test. 
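The recurring change running through these unit-test files replaces `typedef` declarations with `using` aliases. The two spellings declare exactly the same type; the `using` form simply puts the alias name first and is the only form that extends to alias templates. A minimal sketch of the pattern follows (the `RangeList` name matches BitVectorTest.cpp above; `PairVector` is a hypothetical addition for illustration and is not part of the patch):

```cpp
#include <type_traits>
#include <utility>
#include <vector>

// Old spelling: the alias name is embedded in the declarator.
typedef std::vector<std::pair<int, int>> RangeListTypedef;

// New spelling: name on the left, aliased type on the right.
using RangeList = std::vector<std::pair<int, int>>;

// Alias templates are only expressible with 'using' (hypothetical example).
template <typename T> using PairVector = std::vector<std::pair<T, T>>;

static_assert(std::is_same_v<RangeListTypedef, RangeList>,
              "both spellings name the same type");
```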
@@ -537,11 +534,11 @@ TYPED_TEST(SmallVectorTest, AppendNonIterTest) { } struct output_iterator { - typedef std::output_iterator_tag iterator_category; - typedef int value_type; - typedef int difference_type; - typedef value_type *pointer; - typedef value_type &reference; + using iterator_category = std::output_iterator_tag; + using value_type = int; + using difference_type = int; + using pointer = value_type *; + using reference = value_type &; operator int() { return 2; } operator Constructable() { return 7; } }; @@ -896,7 +893,7 @@ class DualSmallVectorsTest<std::pair<VectorT1, VectorT2>> : public SmallVectorTe VectorT2 otherVector; }; -typedef ::testing::Types< +using DualSmallVectorTestTypes = ::testing::Types< // Small mode -> Small mode. std::pair<SmallVector<Constructable, 4>, SmallVector<Constructable, 4>>, // Small mode -> Big mode. @@ -904,8 +901,7 @@ typedef ::testing::Types< // Big mode -> Small mode. std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 4>>, // Big mode -> Big mode. - std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>> - > DualSmallVectorTestTypes; + std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>>>; TYPED_TEST_SUITE(DualSmallVectorsTest, DualSmallVectorTestTypes, ); diff --git a/llvm/unittests/ADT/SparseMultiSetTest.cpp b/llvm/unittests/ADT/SparseMultiSetTest.cpp index 54f7bc99b52fa..91d37f4684b9e 100644 --- a/llvm/unittests/ADT/SparseMultiSetTest.cpp +++ b/llvm/unittests/ADT/SparseMultiSetTest.cpp @@ -13,7 +13,7 @@ using namespace llvm; namespace { -typedef SparseMultiSet<unsigned> USet; +using USet = SparseMultiSet<unsigned>; // Empty set tests. TEST(SparseMultiSetTest, EmptySet) { @@ -211,7 +211,7 @@ struct Alt { }; TEST(SparseMultiSetTest, AltStructSet) { - typedef SparseMultiSet<Alt> ASet; + using ASet = SparseMultiSet<Alt>; ASet Set; Set.setUniverse(10); Set.insert(Alt(1005)); diff --git a/llvm/unittests/ADT/SparseSetTest.cpp b/llvm/unittests/ADT/SparseSetTest.cpp index 4fbf1caa247b7..f2b932907dc38 100644 --- a/llvm/unittests/ADT/SparseSetTest.cpp +++ b/llvm/unittests/ADT/SparseSetTest.cpp @@ -13,7 +13,7 @@ using namespace llvm; namespace { -typedef SparseSet<unsigned> USet; +using USet = SparseSet<unsigned>; // Empty set tests. TEST(SparseSetTest, EmptySet) { @@ -166,7 +166,7 @@ struct Alt { }; TEST(SparseSetTest, AltStructSet) { - typedef SparseSet<Alt> ASet; + using ASet = SparseSet<Alt>; ASet Set; Set.setUniverse(10); Set.insert(Alt(1005)); diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp index c94feb54d0b7d..75d50f4dd1b5b 100644 --- a/llvm/unittests/ADT/StringSwitchTest.cpp +++ b/llvm/unittests/ADT/StringSwitchTest.cpp @@ -240,6 +240,23 @@ TEST(StringSwitchTest, CasesCopies) { EXPECT_EQ(NumCopies, 1u); } +TEST(StringSwitchTest, StringSwitchMultipleMatches) { + auto Translate = [](StringRef S) { + return llvm::StringSwitch<int>(S) + .CaseLower("A", 0) + .Case("b", 1) + .Case("a", 2) + .CasesLower({"a", "b"}, 3) + .DefaultUnreachable(); + }; + + // Check that the value of the first match is returned. 
+ EXPECT_EQ(0, Translate("A")); + EXPECT_EQ(0, Translate("a")); + EXPECT_EQ(3, Translate("B")); + EXPECT_EQ(1, Translate("b")); +} + TEST(StringSwitchTest, DefaultUnreachable) { auto Translate = [](StringRef S) { return llvm::StringSwitch<int>(S) diff --git a/llvm/unittests/ADT/TestGraph.h b/llvm/unittests/ADT/TestGraph.h index a59ab504f7144..bb2ec47a0d5fe 100644 --- a/llvm/unittests/ADT/TestGraph.h +++ b/llvm/unittests/ADT/TestGraph.h @@ -34,7 +34,7 @@ class Graph { /// NodeSubset - A subset of the graph's nodes. class NodeSubset { - typedef unsigned char BitVector; // Where the limitation N <= 8 comes from. + using BitVector = unsigned char; // Where the limitation N <= 8 comes from. BitVector Elements; NodeSubset(BitVector e) : Elements(e) {} public: @@ -96,7 +96,7 @@ class Graph { }; /// NodeType - Node index and set of children of the node. - typedef std::pair<unsigned, NodeSubset> NodeType; + using NodeType = std::pair<unsigned, NodeSubset>; private: /// Nodes - The list of nodes for this graph. @@ -233,8 +233,8 @@ class Graph { template <unsigned N> struct GraphTraits<Graph<N> > { - typedef typename Graph<N>::NodeType *NodeRef; - typedef typename Graph<N>::ChildIterator ChildIteratorType; + using NodeRef = typename Graph<N>::NodeType *; + using ChildIteratorType = typename Graph<N>::ChildIterator; static NodeRef getEntryNode(const Graph<N> &G) { return G.AccessNode(0); } static ChildIteratorType child_begin(NodeRef Node) { diff --git a/llvm/unittests/ADT/TinyPtrVectorTest.cpp b/llvm/unittests/ADT/TinyPtrVectorTest.cpp index af4ae4f4b0db9..c77721df5055c 100644 --- a/llvm/unittests/ADT/TinyPtrVectorTest.cpp +++ b/llvm/unittests/ADT/TinyPtrVectorTest.cpp @@ -28,14 +28,14 @@ template <typename PointerTy, unsigned IntBits, typename IntType, typename PtrTraits, typename Info> struct RemovePointer< PointerIntPair<PointerTy, IntBits, IntType, PtrTraits, Info>> { - typedef typename RemovePointer<PointerTy>::type type; + using type = typename RemovePointer<PointerTy>::type; }; template <typename VectorT> class TinyPtrVectorTest : public testing::Test { protected: - typedef typename VectorT::value_type PtrT; - typedef typename RemovePointer<PtrT>::type ValueT; + using PtrT = typename VectorT::value_type; + using ValueT = typename RemovePointer<PtrT>::type; using PtrTraits = PointerLikeTypeTraits<PtrT>; VectorT V; @@ -78,9 +78,9 @@ class TinyPtrVectorTest : public testing::Test { } }; -typedef ::testing::Types<TinyPtrVector<int *>, TinyPtrVector<double *>, - TinyPtrVector<PointerIntPair<int *, 1>>> - TinyPtrVectorTestTypes; +using TinyPtrVectorTestTypes = + ::testing::Types<TinyPtrVector<int *>, TinyPtrVector<double *>, + TinyPtrVector<PointerIntPair<int *, 1>>>; TYPED_TEST_SUITE(TinyPtrVectorTest, TinyPtrVectorTestTypes, ); TYPED_TEST(TinyPtrVectorTest, EmptyTest) { diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp index db67e30ca203b..692da230b6e09 100644 --- a/llvm/unittests/CAS/ActionCacheTest.cpp +++ b/llvm/unittests/CAS/ActionCacheTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; using namespace llvm::cas; TEST_P(CASTest, ActionCacheHit) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID; @@ -36,7 +36,7 @@ TEST_P(CASTest, ActionCacheHit) { } TEST_P(CASTest, ActionCacheMiss) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); 
std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID1, ID2; @@ -59,7 +59,7 @@ TEST_P(CASTest, ActionCacheMiss) { } TEST_P(CASTest, ActionCacheRewrite) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID1, ID2; diff --git a/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp new file mode 100644 index 0000000000000..19522e9372d85 --- /dev/null +++ b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "CASTestConfig.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; + +TEST_F(OnDiskCASTest, UnifiedCASMaterializationCheckPreventsGarbageCollection) { + unittest::TempDir Temp("on-disk-unified-cas", /*Unique=*/true); + + auto WithCAS = [&](llvm::function_ref<void(ObjectStore &)> Action) { + std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>> DBs; + ASSERT_THAT_ERROR( + createOnDiskUnifiedCASDatabases(Temp.path()).moveInto(DBs), + Succeeded()); + ObjectStore &CAS = *DBs.first; + ASSERT_THAT_ERROR(CAS.setSizeLimit(1), Succeeded()); + Action(CAS); + }; + + std::optional<CASID> ID; + + // Create an object in the CAS. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref; + ASSERT_THAT_ERROR(CAS.store({}, "blah").moveInto(Ref), Succeeded()); + ASSERT_TRUE(Ref.has_value()); + + ID = CAS.getID(*Ref); + }); + + // Check materialization and prune the storage. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional<bool> IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + + ASSERT_THAT_ERROR(CAS.pruneStorageData(), Succeeded()); + }); + + // Verify that the previous materialization check kept the object in the CAS. 
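+  // (With the 1-byte size limit set inside WithCAS, pruning could otherwise
+  // have discarded it.)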
+ WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional<bool> IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + }); +} diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp index 10e4b689e151e..08cbf1daf727d 100644 --- a/llvm/unittests/CAS/CASTestConfig.cpp +++ b/llvm/unittests/CAS/CASTestConfig.cpp @@ -8,6 +8,7 @@ #include "CASTestConfig.h" #include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" #include <mutex> @@ -15,7 +16,8 @@ using namespace llvm; using namespace llvm::cas; static CASTestingEnv createInMemory(int I) { - return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache()}; + return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache(), + std::nullopt}; } INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, @@ -23,7 +25,7 @@ INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, #if LLVM_ENABLE_ONDISK_CAS namespace llvm::cas::ondisk { -extern void setMaxMappingSize(uint64_t Size); +void setMaxMappingSize(uint64_t Size); } // namespace llvm::cas::ondisk void setMaxOnDiskCASMappingSize() { @@ -31,6 +33,17 @@ void setMaxOnDiskCASMappingSize() { std::call_once( Flag, [] { llvm::cas::ondisk::setMaxMappingSize(100 * 1024 * 1024); }); } + +static CASTestingEnv createOnDisk(int I) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr<ObjectStore> CAS; + EXPECT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + std::unique_ptr<ActionCache> Cache; + EXPECT_THAT_ERROR(createOnDiskActionCache(Temp.path()).moveInto(Cache), + Succeeded()); + return CASTestingEnv{std::move(CAS), std::move(Cache), std::move(Temp)}; +} +INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk)); #else void setMaxOnDiskCASMappingSize() {} #endif /* LLVM_ENABLE_ONDISK_CAS */ diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h index 8d3c55305f1b3..b1c0e59ff2b92 100644 --- a/llvm/unittests/CAS/CASTestConfig.h +++ b/llvm/unittests/CAS/CASTestConfig.h @@ -6,16 +6,28 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H +#define LLVM_UNITTESTS_CASTESTCONFIG_H + #include "llvm/CAS/ActionCache.h" #include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/SupportHelpers.h" #include "gtest/gtest.h" +#include <memory> -#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H -#define LLVM_UNITTESTS_CASTESTCONFIG_H +namespace llvm::unittest::cas { +class MockEnv { + void anchor(); + +public: + virtual ~MockEnv(); +}; +} // namespace llvm::unittest::cas struct CASTestingEnv { std::unique_ptr<llvm::cas::ObjectStore> CAS; std::unique_ptr<llvm::cas::ActionCache> Cache; + std::optional<llvm::unittest::TempDir> Temp; }; void setMaxOnDiskCASMappingSize(); @@ -24,26 +36,47 @@ void setMaxOnDiskCASMappingSize(); class OnDiskCASTest : public ::testing::Test { protected: void SetUp() override { +#if !LLVM_ENABLE_ONDISK_CAS + GTEST_SKIP() << "OnDiskCAS is not enabled"; +#endif // Use a smaller database size for testing to conserve disk space. setMaxOnDiskCASMappingSize(); } }; +// Parametered test fixture for ObjectStore and ActionCache tests. 
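+// Each parameter is a factory returning a CASTestingEnv that bundles the
+// ObjectStore, the ActionCache, and (for on-disk configurations) the TempDir
+// that owns their backing storage.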
class CASTest : public testing::TestWithParam<std::function<CASTestingEnv(int)>> { protected: std::optional<int> NextCASIndex; + llvm::SmallVector<llvm::unittest::TempDir> Dirs; + + llvm::SmallVector<std::unique_ptr<llvm::unittest::cas::MockEnv>> Envs; + std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() { auto TD = GetParam()(++(*NextCASIndex)); + if (TD.Temp) + Dirs.push_back(std::move(*TD.Temp)); return std::move(TD.CAS); } std::unique_ptr<llvm::cas::ActionCache> createActionCache() { auto TD = GetParam()(++(*NextCASIndex)); + if (TD.Temp) + Dirs.push_back(std::move(*TD.Temp)); return std::move(TD.Cache); } - void SetUp() override { NextCASIndex = 0; } - void TearDown() override { NextCASIndex = std::nullopt; } + + void SetUp() override { + NextCASIndex = 0; + setMaxOnDiskCASMappingSize(); + } + + void TearDown() override { + NextCASIndex = std::nullopt; + Dirs.clear(); + Envs.clear(); + } }; #endif diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt index da469f7fccb5a..91e49be770745 100644 --- a/llvm/unittests/CAS/CMakeLists.txt +++ b/llvm/unittests/CAS/CMakeLists.txt @@ -1,9 +1,11 @@ set(ONDISK_CAS_TEST_SOURCES + BuiltinUnifiedCASDatabasesTest.cpp OnDiskGraphDBTest.cpp OnDiskDataAllocatorTest.cpp OnDiskKeyValueDBTest.cpp OnDiskTrieRawHashMapTest.cpp ProgramTest.cpp + UnifiedOnDiskCacheTest.cpp ) set(LLVM_OPTIONAL_SOURCES diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp index 54083fdb408f6..b43ae33d74127 100644 --- a/llvm/unittests/CAS/ObjectStoreTest.cpp +++ b/llvm/unittests/CAS/ObjectStoreTest.cpp @@ -1,4 +1,4 @@ -//===- ObjectStoreTest.cpp ------------------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -76,7 +76,7 @@ multiline text multiline text multiline text multiline text multiline text)", // Run validation on all CASIDs. for (int I = 0, E = IDs.size(); I != E; ++I) - ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded()); + ASSERT_THAT_ERROR(CAS1->validateObject(IDs[I]), Succeeded()); // Check that the blobs can be retrieved multiple times. for (int I = 0, E = IDs.size(); I != E; ++I) { @@ -120,15 +120,15 @@ TEST_P(CASTest, BlobsBig) { std::optional<CASID> ID2; ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String1.append(String2); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String2.append(String1); } @@ -176,10 +176,11 @@ multiline text multiline text multiline text multiline text multiline text)", // Check basic printing of IDs. 
IDs.push_back(CAS1->getID(*Node)); - auto ID = CAS1->getID(Nodes.back()); - EXPECT_EQ(ID.toString(), IDs.back().toString()); - EXPECT_EQ(*Node, Nodes.back()); - EXPECT_EQ(ID, IDs.back()); + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + EXPECT_EQ(Nodes.front(), Nodes.front()); + EXPECT_EQ(Nodes.back(), Nodes.back()); + EXPECT_EQ(IDs.front(), IDs.front()); + EXPECT_EQ(IDs.back(), IDs.back()); if (Nodes.size() <= 1) continue; EXPECT_NE(Nodes.front(), Nodes.back()); @@ -266,7 +267,7 @@ TEST_P(CASTest, NodesBig) { } for (auto ID : CreatedNodes) - ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(CAS->getID(ID)), Succeeded()); } #if LLVM_ENABLE_THREADS @@ -332,17 +333,124 @@ static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) { } TEST_P(CASTest, BlobsParallel) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); uint64_t Size = 1ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #ifdef EXPENSIVE_CHECKS TEST_P(CASTest, BlobsBigParallel) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); // 100k is large enough to be standalone files in our on-disk cas. uint64_t Size = 100ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #endif // EXPENSIVE_CHECKS + +#ifndef _WIN32 // create_link won't work for directories on Windows +TEST_F(OnDiskCASTest, OnDiskCASBlobsParallelMultiCAS) { + // This test intentionally uses symlinked paths to the same CAS to subvert the + // shared memory mappings that would normally be created within a single + // process. This breaks the lock file guarantees, so we must be careful not + // to create or destroy the CAS objects concurrently, which is when the locks + // are normally important. + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + uint64_t Size = 1ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} + +TEST_F(OnDiskCASTest, OnDiskCASBlobsBigParallelMultiCAS) { + // See comment in BlobsParallelMultiCAS. 
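+  // (i.e. OnDiskCASBlobsParallelMultiCAS above: the symlinked paths bypass the
+  // shared memory mappings, so the CAS objects must not be created or
+  // destroyed concurrently.)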
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + // 100k is large enough to be standalone files in our on-disk cas. + uint64_t Size = 100ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} +#endif // _WIN32 #endif // LLVM_ENABLE_THREADS + +TEST_F(OnDiskCASTest, OnDiskCASDiskSize) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr<ObjectStore> CAS; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + + uint64_t MaxSize = 100 * 1024 * 1024; + + // Check that we map the files to the correct size. + auto CheckFileSizes = [&](bool Mapped) { + bool FoundIndex = false, FoundData = false; + std::error_code EC; + for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC; + I.increment(EC)) { + StringRef Filename = sys::path::filename(I->path()); + if (Filename.starts_with("index.") && !Filename.ends_with(".shared")) { + FoundIndex = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + if (Filename.starts_with("data.") && !Filename.ends_with(".shared")) { + FoundData = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + } + ASSERT_TRUE(FoundIndex); + ASSERT_TRUE(FoundData); + }; + + // Check that we have the full mapping size when the CAS is open. + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + // Check that the CAS is shrunk to a smaller size. + CheckFileSizes(/*Mapped=*/false); + + // Repeat the checks when starting from an existing CAS. 
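+  // Reopening should map the files back up to MaxSize and shrink them again
+  // once the CAS is closed.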
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + CheckFileSizes(/*Mapped=*/false); +} diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h index 89f93e08366c9..48a1830f9b219 100644 --- a/llvm/unittests/CAS/OnDiskCommonUtils.h +++ b/llvm/unittests/CAS/OnDiskCommonUtils.h @@ -12,6 +12,8 @@ #include "llvm/CAS/BuiltinObjectHasher.h" #include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/BLAKE3.h" #include "llvm/Testing/Support/Error.h" @@ -58,6 +60,25 @@ inline Expected<ObjectID> store(OnDiskGraphDB &DB, StringRef Data, return ID; } +inline Expected<ObjectID> cachePut(OnDiskKeyValueDB &DB, ArrayRef<uint8_t> Key, + ObjectID ID) { + auto Value = UnifiedOnDiskCache::getValueFromObjectID(ID); + auto Result = DB.put(Key, Value); + if (!Result) + return Result.takeError(); + return UnifiedOnDiskCache::getObjectIDFromValue(*Result); +} + +inline Expected<std::optional<ObjectID>> cacheGet(OnDiskKeyValueDB &DB, + ArrayRef<uint8_t> Key) { + auto Result = DB.get(Key); + if (!Result) + return Result.takeError(); + if (!*Result) + return std::nullopt; + return UnifiedOnDiskCache::getObjectIDFromValue(**Result); +} + inline Error printTree(OnDiskGraphDB &DB, ObjectID ID, raw_ostream &OS, unsigned Indent = 0) { std::optional<ondisk::ObjectHandle> Obj; diff --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp index 3c2e96318a5ed..e9c73bfb6c8d3 100644 --- a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp +++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp @@ -102,7 +102,7 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInSingleNode) { std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR( OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), - std::move(UpstreamDB), + UpstreamDB.get(), OnDiskGraphDB::FaultInPolicy::SingleNode) .moveInto(DB), Succeeded()); @@ -208,7 +208,7 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInFullTree) { unittest::TempDir Temp("ondiskcas", /*Unique=*/true); std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), - std::move(UpstreamDB), + UpstreamDB.get(), OnDiskGraphDB::FaultInPolicy::FullTree) .moveInto(DB), Succeeded()); @@ -264,14 +264,14 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInPolicyConflict) { unittest::TempDir Temp("ondiskcas", /*Unique=*/true); std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", - sizeof(HashType), - std::move(UpstreamDB), Policy1) + sizeof(HashType), UpstreamDB.get(), + Policy1) .moveInto(DB), Succeeded()); DB.reset(); ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", - sizeof(HashType), - std::move(UpstreamDB), Policy2) + sizeof(HashType), UpstreamDB.get(), + Policy2) .moveInto(DB), Failed()); }; diff --git a/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp new file mode 100644 index 0000000000000..09aebc2d4bc19 --- /dev/null +++ b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp @@ -0,0 +1,198 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "CASTestConfig.h" +#include "OnDiskCommonUtils.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; +using namespace llvm::unittest::cas; + +/// Visits all the files of a directory recursively and returns the sum of their +/// sizes. +static Expected<size_t> countFileSizes(StringRef Path) { + size_t TotalSize = 0; + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() == sys::fs::file_type::directory_file) { + Expected<size_t> Subsize = countFileSizes(DirI->path()); + if (!Subsize) + return Subsize.takeError(); + TotalSize += *Subsize; + continue; + } + ErrorOr<sys::fs::basic_file_status> Stat = DirI->status(); + if (!Stat) + return createFileError(DirI->path(), Stat.getError()); + TotalSize += Stat->getSize(); + } + if (EC) + return createFileError(Path, EC); + return TotalSize; +} + +TEST_F(OnDiskCASTest, UnifiedOnDiskCacheTest) { + unittest::TempDir Temp("ondisk-unified", /*Unique=*/true); + std::unique_ptr<UnifiedOnDiskCache> UniDB; + + const uint64_t SizeLimit = 1024ull * 64; + auto reopenDB = [&]() { + UniDB.reset(); + ASSERT_THAT_ERROR(UnifiedOnDiskCache::open(Temp.path(), SizeLimit, "blake3", + sizeof(HashType)) + .moveInto(UniDB), + Succeeded()); + }; + + reopenDB(); + + HashType RootHash; + HashType OtherHash; + HashType Key1Hash; + HashType Key2Hash; + { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID1; + ASSERT_THAT_ERROR(store(DB, "1", {}).moveInto(ID1), Succeeded()); + std::optional<ObjectID> ID2; + ASSERT_THAT_ERROR(store(DB, "2", {}).moveInto(ID2), Succeeded()); + std::optional<ObjectID> IDRoot; + ASSERT_THAT_ERROR(store(DB, "root", {*ID1, *ID2}).moveInto(IDRoot), + Succeeded()); + ArrayRef<uint8_t> Digest = DB.getDigest(*IDRoot); + ASSERT_EQ(Digest.size(), RootHash.size()); + llvm::copy(Digest, RootHash.data()); + + std::optional<ObjectID> IDOther; + ASSERT_THAT_ERROR(store(DB, "other", {}).moveInto(IDOther), Succeeded()); + Digest = DB.getDigest(*IDOther); + ASSERT_EQ(Digest.size(), OtherHash.size()); + llvm::copy(Digest, OtherHash.data()); + + Key1Hash = digest("key1"); + std::optional<ObjectID> Val; + ASSERT_THAT_ERROR( + cachePut(UniDB->getKeyValueDB(), Key1Hash, *IDRoot).moveInto(Val), + Succeeded()); + EXPECT_EQ(IDRoot, Val); + + Key2Hash = digest("key2"); + std::optional<ObjectID> KeyID; + ASSERT_THAT_ERROR(DB.getReference(Key2Hash).moveInto(KeyID), Succeeded()); + ASSERT_THAT_ERROR(cachePut(UniDB->getKeyValueDB(), + UniDB->getGraphDB().getDigest(*KeyID), *ID1) + .moveInto(Val), + Succeeded()); + } + + auto checkTree = [&](const HashType &Digest, StringRef ExpectedTree) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(DB.getReference(Digest).moveInto(ID), Succeeded()); + std::string PrintedTree; + raw_string_ostream OS(PrintedTree); + ASSERT_THAT_ERROR(printTree(DB, *ID, OS), Succeeded()); + EXPECT_EQ(PrintedTree, ExpectedTree); + }; + auto checkRootTree = [&]() { + return checkTree(RootHash, "root\n 1\n 2\n"); + }; + + auto checkKey = [&](const HashType &Key, StringRef ExpectedData) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> Val; + 
ASSERT_THAT_ERROR(cacheGet(UniDB->getKeyValueDB(), Key).moveInto(Val), + Succeeded()); + + ASSERT_TRUE(Val.has_value()); + std::optional<ondisk::ObjectHandle> Obj; + ASSERT_THAT_ERROR(DB.load(*Val).moveInto(Obj), Succeeded()); + EXPECT_EQ(toStringRef(DB.getObjectData(*Obj)), ExpectedData); + }; + + checkRootTree(); + checkTree(OtherHash, "other\n"); + checkKey(Key1Hash, "root"); + checkKey(Key2Hash, "1"); + + auto storeBigObject = [&](unsigned Index) { + SmallString<1000> Buf; + Buf.append(970, 'a'); + raw_svector_ostream(Buf) << Index; + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(store(UniDB->getGraphDB(), Buf, {}).moveInto(ID), + Succeeded()); + }; + + uint64_t PrevStoreSize = UniDB->getStorageSize(); + unsigned Index = 0; + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + EXPECT_GT(UniDB->getStorageSize(), PrevStoreSize); + UniDB->setSizeLimit(SizeLimit * 2); + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + UniDB->setSizeLimit(SizeLimit); + EXPECT_TRUE(UniDB->hasExceededSizeLimit()); + + reopenDB(); + + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + PrevStoreSize = UniDB->getStorageSize(); + ASSERT_THAT_ERROR(UniDB->close(), Succeeded()); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + reopenDB(); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + std::optional<size_t> DirSizeBefore; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeBefore), + Succeeded()); + + ASSERT_THAT_ERROR(UnifiedOnDiskCache::collectGarbage(Temp.path()), + Succeeded()); + + std::optional<size_t> DirSizeAfter; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeAfter), + Succeeded()); + EXPECT_LT(*DirSizeAfter, *DirSizeBefore); + + reopenDB(); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + EXPECT_LT(UniDB->getStorageSize(), PrevStoreSize); + + // 'Other' tree and 'Key2' got garbage-collected. + { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(DB.getReference(OtherHash).moveInto(ID), Succeeded()); + EXPECT_FALSE(DB.containsObject(*ID)); + std::optional<ObjectID> Val; + ASSERT_THAT_ERROR(cacheGet(UniDB->getKeyValueDB(), Key2Hash).moveInto(Val), + Succeeded()); + EXPECT_FALSE(Val.has_value()); + } +} diff --git a/llvm/unittests/Support/Casting.cpp b/llvm/unittests/Support/Casting.cpp index 790675083614b..0df8b9fcab452 100644 --- a/llvm/unittests/Support/Casting.cpp +++ b/llvm/unittests/Support/Casting.cpp @@ -561,6 +561,47 @@ TEST(CastingTest, assertion_check_unique_ptr) { << "Invalid cast of const ref did not cause an abort()"; } +TEST(Casting, StaticCastPredicate) { + uint32_t Value = 1; + + static_assert( + std::is_same_v<decltype(StaticCastTo<uint64_t>(Value)), uint64_t>); +} + +TEST(Casting, LLVMRTTIPredicates) { + struct Base { + enum Kind { BK_Base, BK_Derived }; + const Kind K; + Base(Kind K = BK_Base) : K(K) {} + Kind getKind() const { return K; } + virtual ~Base() = default; + }; + + struct Derived : Base { + Derived() : Base(BK_Derived) {} + static bool classof(const Base *B) { return B->getKind() == BK_Derived; } + bool Field = false; + }; + + Base B; + Derived D; + Base *BD = &D; + Base *Null = nullptr; + + // Pointers. 
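+  // DynCastTo yields nullptr on a failed cast, while the *IfPresentTo variants
+  // additionally accept a null input; both behaviors are exercised below.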
+ EXPECT_EQ(DynCastTo<Derived>(BD), &D); + EXPECT_EQ(CastTo<Derived>(BD), &D); + EXPECT_EQ(DynCastTo<Derived>(&B), nullptr); + EXPECT_EQ(CastIfPresentTo<Derived>(BD), &D); + EXPECT_EQ(CastIfPresentTo<Derived>(Null), nullptr); + EXPECT_EQ(DynCastIfPresentTo<Derived>(BD), &D); + EXPECT_EQ(DynCastIfPresentTo<Derived>(Null), nullptr); + + Base &R = D; + CastTo<Derived>(R).Field = true; + EXPECT_TRUE(D.Field); +} + } // end namespace assertion_checks #endif } // end namespace diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 4b53cc3d6e516..4908eda16e002 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -1153,7 +1153,7 @@ TEST(Local, ExpressionForConstant) { IntegerType *Int1Ty = Type::getInt1Ty(Context); Expr = createExpression(ConstantInt::getTrue(Context), Int1Ty); EXPECT_NE(Expr, nullptr); - EXPECT_EQ(Expr->getElement(1), 18446744073709551615U); + EXPECT_EQ(Expr->getElement(1), 1U); Expr = createExpression(ConstantInt::getFalse(Context), Int1Ty); EXPECT_NE(Expr, nullptr); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 46802826fe090..169114ed6c310 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -326,22 +326,18 @@ TEST_F(VPVerifierTest, NonHeaderPHIInHeader) { class VPIRVerifierTest : public VPlanTestIRBase {}; -TEST_F(VPIRVerifierTest, testVerifyIRPhi) { +TEST_F(VPIRVerifierTest, testVerifyIRPhiInScalarHeaderVPIRBB) { const char *ModuleString = "define void @f(ptr %A, i64 %N) {\n" "entry:\n" " br label %loop\n" "loop:\n" " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]\n" - " %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\n" - " %l1 = load i32, ptr %arr.idx, align 4\n" - " %res = add i32 %l1, 10\n" - " store i32 %res, ptr %arr.idx, align 4\n" " %iv.next = add i64 %iv, 1\n" " %exitcond = icmp ne i64 %iv.next, %N\n" " br i1 %exitcond, label %loop, label %for.end\n" "for.end:\n" - " %p = phi i32 [ %l1, %loop ]\n" + " %p = phi i64 [ %iv, %loop ]\n" " ret void\n" "}\n"; @@ -351,7 +347,48 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhi) { BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); auto Plan = buildVPlan(LoopHeader); - Plan->getExitBlocks()[0]->front().addOperand(Plan->getConstantInt(32, 0)); +#if GTEST_HAS_STREAM_REDIRECTION + ::testing::internal::CaptureStderr(); +#endif + EXPECT_FALSE(verifyVPlanIsValid(*Plan)); +#if GTEST_HAS_STREAM_REDIRECTION + EXPECT_STREQ( + "Phi-like recipe with different number of operands and predecessors.\n", + ::testing::internal::GetCapturedStderr().c_str()); +#endif +} + +TEST_F(VPIRVerifierTest, testVerifyIRPhiInExitVPIRBB) { + const char *ModuleString = + "define void @f(ptr %A, i64 %N) {\n" + "entry:\n" + " br label %loop\n" + "loop:\n" + " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]\n" + " %iv.next = add i64 %iv, 1\n" + " %exitcond = icmp ne i64 %iv.next, %N\n" + " br i1 %exitcond, label %loop, label %for.end\n" + "for.end:\n" + " %p = phi i64 [ %iv, %loop ]\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("f"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildVPlan(LoopHeader); + + // Create a definition in the vector loop header that will be used by the phi. 
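+  // (The extract recipe itself is placed in the middle block; only its operand
+  // comes from the vector loop header.)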
+ auto *HeaderBlock = + cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getEntry()); + VPInstruction *DefI = + new VPInstruction(VPInstruction::ExtractLastElement, + {HeaderBlock->front().getVPSingleValue()}); + DefI->insertBefore(Plan->getMiddleBlock()->getTerminator()); + Plan->getExitBlocks()[0]->front().addOperand(DefI); + VPValue *Zero = Plan->getConstantInt(32, 0); + Plan->getScalarHeader()->front().addOperand(Zero); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index cd866469792a2..ff894853b9771 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -407,6 +407,8 @@ void CodeGenIntrinsic::setProperty(const Record *R) { hasSideEffects = true; else if (R->getName() == "IntrStrictFP") isStrictFP = true; + else if (R->getName() == "IntrNoCreateUndefOrPoison") + isNoCreateUndefOrPoison = true; else if (R->isSubClassOf("NoCapture")) { unsigned ArgNo = R->getValueAsInt("ArgNo"); addArgAttribute(ArgNo, NoCapture); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 2e86149514f46..15e803c4feba1 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -114,6 +114,9 @@ struct CodeGenIntrinsic { // True if the intrinsic is marked as strictfp. bool isStrictFP = false; + // True if the intrinsic is marked as IntrNoCreateUndefOrPoison. + bool isNoCreateUndefOrPoison = false; + enum ArgAttrKind { NoCapture, NoAlias, diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp index 3c6ff1132230b..d33bf45595e2e 100644 --- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp @@ -81,6 +81,7 @@ static void generateEnumExports(ArrayRef<const Record *> Records, std::string N = getIdentifierName(R, Prefix); OS << "constexpr auto " << N << " = " << Enum << "::" << N << ";\n"; } + OS << "\n"; } // Generate enum class. Entries are emitted in the order in which they appear @@ -88,7 +89,6 @@ static void generateEnumExports(ArrayRef<const Record *> Records, static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, StringRef Enum, StringRef Prefix, bool ExportEnums) { - OS << "\n"; OS << "enum class " << Enum << " {\n"; if (!Records.empty()) { std::string N; @@ -104,17 +104,15 @@ static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, OS << "};\n"; OS << "\n"; OS << "static constexpr std::size_t " << Enum - << "_enumSize = " << Records.size() << ";\n"; + << "_enumSize = " << Records.size() << ";\n\n"; // Make the enum values available in the defined namespace. This allows us to // write something like Enum_X if we have a `using namespace <CppNamespace>`. // At the same time we do not loose the strong type guarantees of the enum // class, that is we cannot pass an unsigned as Directive without an explicit // cast. - if (ExportEnums) { - OS << "\n"; + if (ExportEnums) generateEnumExports(Records, OS, Enum, Prefix); - } } // Generate enum class with values corresponding to different bit positions. @@ -127,7 +125,6 @@ static void generateEnumBitmask(ArrayRef<const Record *> Records, StringRef Type = Records.size() <= 32 ? "uint32_t" : "uint64_t"; StringRef TypeSuffix = Records.size() <= 32 ? 
"U" : "ULL"; - OS << "\n"; OS << "enum class " << Enum << " : " << Type << " {\n"; std::string LastName; for (auto [I, R] : llvm::enumerate(Records)) { @@ -138,17 +135,15 @@ static void generateEnumBitmask(ArrayRef<const Record *> Records, OS << "};\n"; OS << "\n"; OS << "static constexpr std::size_t " << Enum - << "_enumSize = " << Records.size() << ";\n"; + << "_enumSize = " << Records.size() << ";\n\n"; // Make the enum values available in the defined namespace. This allows us to // write something like Enum_X if we have a `using namespace <CppNamespace>`. // At the same time we do not loose the strong type guarantees of the enum // class, that is we cannot pass an unsigned as Directive without an explicit // cast. - if (ExportEnums) { - OS << "\n"; + if (ExportEnums) generateEnumExports(Records, OS, Enum, Prefix); - } } // Generate enums for values that clauses can take. @@ -170,7 +165,6 @@ static void generateClauseEnumVal(ArrayRef<const Record *> Records, return; } - OS << "\n"; OS << "enum class " << Enum << " {\n"; for (const EnumVal Val : ClauseVals) OS << " " << Val.getRecordName() << "=" << Val.getValue() << ",\n"; @@ -182,6 +176,7 @@ static void generateClauseEnumVal(ArrayRef<const Record *> Records, OS << "constexpr auto " << CV->getName() << " = " << Enum << "::" << CV->getName() << ";\n"; } + OS << "\n"; EnumHelperFuncs += (Twine("LLVM_ABI ") + Twine(Enum) + Twine(" get") + Twine(Enum) + Twine("(StringRef Str);\n")) .str(); @@ -284,7 +279,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { NamespaceEmitter DirLangNS(OS, DirLang.getCppNamespace()); if (DirLang.hasEnableBitmaskEnumInNamespace()) - OS << "\nLLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();\n"; + OS << "LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();\n\n"; // Emit Directive associations std::vector<const Record *> Associations; @@ -315,7 +310,6 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { generateClauseEnumVal(DirLang.getClauses(), OS, DirLang, EnumHelperFuncs); // Generic function signatures - OS << "\n"; OS << "// Enumeration helper functions\n"; OS << "LLVM_ABI std::pair<Directive, directive::VersionRange> get" << Lang @@ -353,10 +347,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { OS << "LLVM_ABI Association getDirectiveAssociation(Directive D);\n"; OS << "LLVM_ABI Category getDirectiveCategory(Directive D);\n"; OS << "LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);\n"; - if (EnumHelperFuncs.length() > 0) { - OS << EnumHelperFuncs; - OS << "\n"; - } + OS << EnumHelperFuncs; DirLangNS.close(); diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 75dffb18fca5a..452d2b08f25c3 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -421,7 +421,8 @@ static bool compareFnAttributes(const CodeGenIntrinsic *L, return std::tie(I->canThrow, I->isNoDuplicate, I->isNoMerge, I->isNoReturn, I->isNoCallback, I->isNoSync, I->isNoFree, I->isWillReturn, I->isCold, I->isConvergent, I->isSpeculatable, - I->hasSideEffects, I->isStrictFP); + I->hasSideEffects, I->isStrictFP, + I->isNoCreateUndefOrPoison); }; auto TieL = TieBoolAttributes(L); @@ -446,7 +447,8 @@ static bool hasFnAttributes(const CodeGenIntrinsic &Int) { return !Int.canThrow || Int.isNoReturn || Int.isNoCallback || Int.isNoSync || Int.isNoFree || Int.isWillReturn || Int.isCold || Int.isNoDuplicate || Int.isNoMerge || Int.isConvergent || 
Int.isSpeculatable || - Int.isStrictFP || getEffectiveME(Int) != MemoryEffects::unknown(); + Int.isStrictFP || Int.isNoCreateUndefOrPoison || + getEffectiveME(Int) != MemoryEffects::unknown(); } namespace { @@ -605,6 +607,8 @@ static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { addAttribute("Speculatable"); if (Int.isStrictFP) addAttribute("StrictFP"); + if (Int.isNoCreateUndefOrPoison) + addAttribute("NoCreateUndefOrPoison"); const MemoryEffects ME = getEffectiveME(Int); if (ME != MemoryEffects::unknown()) { diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 469e27facedb0..82377862885c8 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -51,9 +51,9 @@ class string: ) ASM_FUNCTION_AMDGPU_RE = re.compile( - r"\.type\s+_?(?P<func>[^,\n]+),@function\n" + r"\.type\s+(_|\.L)?(?P<func>[^,\n]+),@function\n" r"(^\s*\.amdgpu_hsa_kernel (?P=func)\n)?" - r'^_?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' + r'^(_|\.L)?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' r"(?P<body>.*?)\n" # (body of the function) # This list is incomplete r"^\s*(\.Lfunc_end[0-9]+:\n|\.section)", @@ -576,6 +576,7 @@ def get_run_handler(triple): "armv7-apple-ios": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_IOS_RE), "armv7-apple-darwin": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), "armv7k-apple-watchos": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), + "thumbv7k-apple-watchos": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), "thumb": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE), "thumb-macho": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), "thumbv5-macho": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 2dad16a8eebb7..baa0377cd8b81 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -605,6 +605,7 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): TRIPLE_ARG_RE = re.compile(r"-m?triple[= ]([^ ]+)") MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)") DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)") +STOP_PASS_RE = re.compile(r"-stop-(before|after)=(\w+)") IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_") IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+") diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py index 24bb8b341d335..01ee0e19f7cb9 100644 --- a/llvm/utils/UpdateTestChecks/mir.py +++ b/llvm/utils/UpdateTestChecks/mir.py @@ -163,13 +163,15 @@ def add_mir_checks_for_function( print_fixed_stack, first_check_is_next, at_the_function_name, + check_indent=None, ): printed_prefixes = set() for run in run_list: for prefix in run[0]: if prefix in printed_prefixes: break - if not func_dict[prefix][func_name]: + # func_info can be empty if there was a prefix conflict. + if not func_dict[prefix].get(func_name): continue if printed_prefixes: # Add some space between different check prefixes. 
@@ -185,6 +187,7 @@ def add_mir_checks_for_function( func_dict[prefix][func_name], print_fixed_stack, first_check_is_next, + check_indent, ) break else: @@ -204,6 +207,7 @@ def add_mir_check_lines( func_info, print_fixed_stack, first_check_is_next, + check_indent=None, ): func_body = str(func_info).splitlines() if single_bb: @@ -220,7 +224,10 @@ def add_mir_check_lines( first_line = func_body[0] indent = len(first_line) - len(first_line.lstrip(" ")) # A check comment, indented the appropriate amount - check = "{:>{}}; {}".format("", indent, prefix) + if check_indent is not None: + check = "{}; {}".format(check_indent, prefix) + else: + check = "{:>{}}; {}".format("", indent, prefix) output_lines.append("{}-LABEL: name: {}".format(check, func_name)) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index f280f695cd3ab..2f84999621e1b 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -32,6 +32,7 @@ static_library("bugprone") { "CopyConstructorInitCheck.cpp", "CrtpConstructorAccessibilityCheck.cpp", "DanglingHandleCheck.cpp", + "DefaultOperatorNewOnOveralignedTypeCheck.cpp", "DerivedMethodShadowingBaseMethodCheck.cpp", "DynamicStaticInitializersCheck.cpp", "EasilySwappableParametersCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn index 3ad0a83a8fb23..ec642b6afad66 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn @@ -16,7 +16,6 @@ static_library("cert") { ] sources = [ "CERTTidyModule.cpp", - "DefaultOperatorNewAlignmentCheck.cpp", "DontModifyStdNamespaceCheck.cpp", "FloatLoopCounter.cpp", "LimitedRandomnessCheck.cpp", diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn index a42f781271a21..b6c2f465a7292 100644 --- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn @@ -21,6 +21,7 @@ static_library("lib") { sources = [ "Breakpoint.cpp", "BreakpointBase.cpp", + "ClientLauncher.cpp", "CommandPlugins.cpp", "DAP.cpp", "DAPError.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn index 5590b27ac3784..1e0e918d3370c 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn @@ -4,14 +4,17 @@ static_library("CAS") { "ActionCache.cpp", "ActionCaches.cpp", "BuiltinCAS.cpp", + "BuiltinUnifiedCASDatabases.cpp", "DatabaseFile.cpp", "InMemoryCAS.cpp", "MappedFileRegionArena.cpp", "ObjectStore.cpp", + "OnDiskCAS.cpp", "OnDiskCommon.cpp", "OnDiskDataAllocator.cpp", "OnDiskGraphDB.cpp", "OnDiskKeyValueDB.cpp", "OnDiskTrieRawHashMap.cpp", + "UnifiedOnDiskCache.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn index 2d9eb6814c376..b10e0e6706cc3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn @@ -9,8 +9,10 @@ unittest("CASTests") { ] sources = [ "ActionCacheTest.cpp", + "BuiltinUnifiedCASDatabasesTest.cpp", "CASTestConfig.cpp", "ObjectStoreTest.cpp", + "UnifiedOnDiskCacheTest.cpp", ] if 
(llvm_enable_ondisk_cas) { diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 3176b1a257434..64148c6098327 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -462,16 +462,15 @@ def executeBuiltinMkdir(cmd, cmd_shenv): stderr = StringIO() exitCode = 0 for dir in args: - cwd = cmd_shenv.cwd - dir = to_unicode(dir) if kIsWindows else to_bytes(dir) - cwd = to_unicode(cwd) if kIsWindows else to_bytes(cwd) - if not os.path.isabs(dir): - dir = lit.util.abs_path_preserve_drive(os.path.join(cwd, dir)) + dir = pathlib.Path(dir) + cwd = pathlib.Path(to_unicode(cmd_shenv.cwd)) + if not dir.is_absolute(): + dir = lit.util.abs_path_preserve_drive(cwd / dir) if parent: - lit.util.mkdir_p(dir) + dir.mkdir(parents=True, exist_ok=True) else: try: - lit.util.mkdir(dir) + dir.mkdir(exist_ok=True) except OSError as err: stderr.write("Error: 'mkdir' command failed, %s\n" % str(err)) exitCode = 1 @@ -2411,7 +2410,7 @@ def runOnce( return out, err, exitCode, timeoutInfo, status # Create the output directory if it does not already exist. - lit.util.mkdir_p(os.path.dirname(tmpBase)) + pathlib.Path(tmpBase).parent.mkdir(parents=True, exist_ok=True) # Re-run failed tests up to test.allowed_retries times. execdir = os.path.dirname(test.getExecPath()) diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py index ce4c3c2df3436..e4e031b3e0898 100644 --- a/llvm/utils/lit/lit/util.py +++ b/llvm/utils/lit/lit/util.py @@ -5,6 +5,7 @@ import math import numbers import os +import pathlib import platform import re import signal @@ -131,48 +132,17 @@ def abs_path_preserve_drive(path): # Since Python 3.8, os.path.realpath resolves sustitute drives, # so we should not use it. In Python 3.7, os.path.realpath # was implemented as os.path.abspath. + if isinstance(path, pathlib.Path): + return path.absolute() return os.path.abspath(path) else: # On UNIX, the current directory always has symbolic links resolved, # so any program accepting relative paths cannot preserve symbolic # links in paths and we should always use os.path.realpath. + if isinstance(path, pathlib.Path): + return path.resolve() return os.path.realpath(path) -def mkdir(path): - try: - if platform.system() == "Windows": - from ctypes import windll - from ctypes import GetLastError, WinError - - path = os.path.abspath(path) - # Make sure that the path uses backslashes here, in case - # python would have happened to use forward slashes, as the - # NT path format only supports backslashes. - path = path.replace("/", "\\") - NTPath = to_unicode(r"\\?\%s" % path) - if not windll.kernel32.CreateDirectoryW(NTPath, None): - raise WinError(GetLastError()) - else: - os.mkdir(path) - except OSError: - e = sys.exc_info()[1] - # ignore EEXIST, which may occur during a race condition - if e.errno != errno.EEXIST: - raise - - -def mkdir_p(path): - """mkdir_p(path) - Make the "path" directory, if it does not exist; this - will also make directories for any missing parent directories.""" - if not path or os.path.exists(path): - return - - parent = os.path.dirname(path) - if parent != path: - mkdir_p(parent) - - mkdir(path) - def listdir_files(dirname, suffixes=None, exclude_filenames=None, prefixes=None): """Yields files in a directory. 
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt index d1329f5dbfaae..71972411bc4cf 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt @@ -1,2 +1,7 @@ ## Tests glob pattern handling in the mkdir command. + +## This mkdir should fail because the `example_file*.input`s are regular files. # RUN: not mkdir %S/example_file*.input + +## This mkdir should succeed (so RUN should fail) because the `example_dir*.input`s that already exist are directories. +# RUN: not mkdir %S/example_dir*.input diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py index ae90f31907d49..aa4705b634a7d 100644 --- a/llvm/utils/lit/tests/shtest-glob.py +++ b/llvm/utils/lit/tests/shtest-glob.py @@ -1,12 +1,13 @@ ## Tests glob pattern handling in echo command. # RUN: not %{lit} -a -v %{inputs}/shtest-glob \ -# RUN: | FileCheck -dump-input=fail -match-full-lines %s -# +# RUN: | FileCheck -dump-input=fail -match-full-lines --implicit-check-not=Error: %s # END. # CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}}) # CHECK: TypeError: string argument expected, got 'GlobItem' -# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}) -# CHECK: # error: command failed with exit status: 1 +# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}) +# CHECK: # | Error: 'mkdir' command failed, {{.*}}example_file1.input' +# CHECK-NEXT: # | Error: 'mkdir' command failed, {{.*}}example_file2.input' +# CHECK: # error: command failed with exit status: 1 diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py index 286fd3d7e173e..d81cde0159792 100644 --- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py +++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py @@ -2,7 +2,7 @@ # ulimit does not work on non-POSIX platforms. # These tests are specific to options that Darwin does not support. 
-# UNSUPPORTED: system-windows, system-darwin, system-aix, system-solaris +# UNSUPPORTED: system-windows, system-cygwin, system-darwin, system-aix, system-solaris # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 380b162d8c58c..53757c73fb8a6 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -75,6 +75,7 @@ CodeGen/Hexagon/masked_gather.ll CodeGen/NVPTX/lower-ctor-dtor.ll CodeGen/RISCV/zmmul.ll CodeGen/WebAssembly/memory-interleave.ll +CodeGen/X86/AMX/amx-low-intrinsics.ll CodeGen/X86/masked_gather_scatter.ll CodeGen/X86/nocfivalue.ll DebugInfo/AArch64/ir-outliner.ll @@ -530,32 +531,6 @@ Instrumentation/TypeSanitizer/swifterror.ll LTO/X86/diagnostic-handler-remarks-with-hotness.ll Other/optimization-remarks-auto.ll Other/X86/debugcounter-partiallyinlinelibcalls.ll -tools/llvm-objcopy/ELF/auto-remove-add-symtab-shndx.test -tools/UpdateTestChecks/update_analyze_test_checks/loop-access-analysis.test -tools/UpdateTestChecks/update_analyze_test_checks/loop-distribute.test -tools/UpdateTestChecks/update_test_checks/argument_name_reuse.test -tools/UpdateTestChecks/update_test_checks/basic.test -tools/UpdateTestChecks/update_test_checks/check_attrs.test -tools/UpdateTestChecks/update_test_checks/difile_absolute_filenames.test -tools/UpdateTestChecks/update_test_checks/filter_out_after.test -tools/UpdateTestChecks/update_test_checks/generated_funcs_prefix_reuse.test -tools/UpdateTestChecks/update_test_checks/generated_funcs.test -tools/UpdateTestChecks/update_test_checks/global_preserve_name.test -tools/UpdateTestChecks/update_test_checks/if_target.test -tools/UpdateTestChecks/update_test_checks/named_function_arguments_split.test -tools/UpdateTestChecks/update_test_checks/on_the_fly_arg_change.test -tools/UpdateTestChecks/update_test_checks/phi-labels.test -tools/UpdateTestChecks/update_test_checks/pre-process.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values2.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values3.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values4.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values5.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values6.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values.test -tools/UpdateTestChecks/update_test_checks/switch_case.test -tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test -tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll Transforms/AtomicExpand/AArch64/pcsections.ll @@ -917,7 +892,6 @@ Transforms/InstCombine/select_frexp.ll Transforms/InstCombine/select.ll Transforms/InstCombine/select-min-max.ll Transforms/InstCombine/select-of-symmetric-selects.ll -Transforms/InstCombine/select-safe-transforms.ll Transforms/InstCombine/select-select.ll Transforms/InstCombine/select-with-extreme-eq-cond.ll Transforms/InstCombine/shift.ll @@ -1324,6 +1298,7 @@ Transforms/SimpleLoopUnswitch/trivial-unswitch.ll Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll +Transforms/StructurizeCFG/callbr.ll Transforms/StructurizeCFG/hoist-zerocost.ll Transforms/StructurizeCFG/loop-break-phi.ll 
Transforms/StructurizeCFG/nested-loop-order.ll diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index 001339f2a8f05..3aeedaa3288a2 100644 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -1,6 +1,9 @@ @echo off -setlocal enabledelayedexpansion +REM Filter out tests that are known to fail. +set "LIT_FILTER_OUT=gh110231.cpp|crt_initializers.cpp|init-order-atexit.cpp|use_after_return_linkage.cpp|initialization-bug.cpp|initialization-bug-no-global.cpp|trace-malloc-unbalanced.test|trace-malloc-2.test|TraceMallocTest" + +setlocal enabledelayedexpansion goto begin :usage @@ -24,6 +27,7 @@ echo. echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64 exit /b 1 + :begin ::============================================================================== @@ -163,7 +167,8 @@ set common_cmake_flags=^ -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^ -DLLVM_ENABLE_RPMALLOC=ON ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld" ^ - -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" + -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" ^ + -DCOMPILER_RT_BUILD_ORC=OFF if "%force-msvc%" == "" ( where /q clang-cl @@ -224,7 +229,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 +REM ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 cd.. @@ -249,7 +254,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 +REM ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 ninja package || exit /b 1 cd .. diff --git a/llvm/utils/update_llc_test_checks.py b/llvm/utils/update_llc_test_checks.py index 8c57e75f34f75..98864be62875b 100755 --- a/llvm/utils/update_llc_test_checks.py +++ b/llvm/utils/update_llc_test_checks.py @@ -15,7 +15,7 @@ import os # Used to advertise this file's name ("autogenerated_note"). import sys -from UpdateTestChecks import common +from UpdateTestChecks import common, mir # llc is the only llc-like in the LLVM tree but downstream forks can add # additional ones here if they have them. @@ -33,6 +33,7 @@ def update_test(ti: common.TestInfo): break run_list = [] + mir_run_list = [] for l in ti.run_lines: if "|" not in l: common.warn("Skipping unparsable RUN line: " + l) @@ -57,9 +58,14 @@ def update_test(ti: common.TestInfo): if m: march_in_cmd = m.groups()[0] + target_list = run_list m = common.DEBUG_ONLY_ARG_RE.search(llc_cmd) if m and m.groups()[0] == "isel": from UpdateTestChecks import isel as output_type + elif not m and common.STOP_PASS_RE.search(llc_cmd): + # MIR output mode. If -debug-only is present assume + # the debug output is the main point of interest. 
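+            # A RUN line such as
+            #   llc -mtriple=<triple> -stop-after=<pass> -o - %s
+            # (with no -debug-only option) is treated as MIR output here.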
+ target_list = mir_run_list else: from UpdateTestChecks import asm as output_type @@ -84,7 +90,7 @@ def update_test(ti: common.TestInfo): # FIXME: We should use multiple check prefixes to common check lines. For # now, we just ignore all but the last. - run_list.append( + target_list.append( ( check_prefixes, llc_tool, @@ -119,14 +125,20 @@ def update_test(ti: common.TestInfo): ginfo=ginfo, ) - for ( - prefixes, - llc_tool, - llc_args, - preprocess_cmd, - triple_in_cmd, - march_in_cmd, - ) in run_list: + # Dictionary to store MIR function bodies separately + mir_func_dict = {} + for run_tuple, is_mir in [(run, False) for run in run_list] + [ + (run, True) for run in mir_run_list + ]: + ( + prefixes, + llc_tool, + llc_args, + preprocess_cmd, + triple_in_cmd, + march_in_cmd, + ) = run_tuple + common.debug("Extracted LLC cmd:", llc_tool, llc_args) common.debug("Extracted FileCheck prefixes:", str(prefixes)) @@ -141,22 +153,54 @@ def update_test(ti: common.TestInfo): if not triple: triple = common.get_triple_from_march(march_in_cmd) - scrubber, function_re = output_type.get_run_handler(triple) - if 0 == builder.process_run_line( - function_re, scrubber, raw_tool_output, prefixes - ): - common.warn( - "Couldn't match any function. Possibly the wrong target triple has been provided" + if is_mir: + # MIR output mode + common.debug("Detected MIR output mode for prefixes:", str(prefixes)) + for prefix in prefixes: + if prefix not in mir_func_dict: + mir_func_dict[prefix] = {} + + mir.build_function_info_dictionary( + ti.path, + raw_tool_output, + triple, + prefixes, + mir_func_dict, + ti.args.verbose, ) - builder.processed_prefixes(prefixes) + else: + # ASM output mode + scrubber, function_re = output_type.get_run_handler(triple) + if 0 == builder.process_run_line( + function_re, scrubber, raw_tool_output, prefixes + ): + common.warn( + "Couldn't match any function. Possibly the wrong target triple has been provided" + ) + builder.processed_prefixes(prefixes) func_dict = builder.finish_and_get_func_dict() + + # Check for conflicts: same prefix used for both ASM and MIR + conflicting_prefixes = set(func_dict.keys()) & set(mir_func_dict.keys()) + if conflicting_prefixes: + common.warn( + "The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: {}".format( + ", ".join(sorted(conflicting_prefixes)) + ), + test_file=ti.path, + ) + for prefix in conflicting_prefixes: + mir_func_dict[prefix] = {} + func_dict[prefix] = {} + global_vars_seen_dict = {} is_in_function = False is_in_function_start = False func_name = None prefix_set = set([prefix for p in run_list for prefix in p[0]]) + prefix_set.update([prefix for p in mir_run_list for prefix in p[0]]) common.debug("Rewriting FileCheck prefixes:", str(prefix_set)) output_lines = [] @@ -221,6 +265,22 @@ def update_test(ti: common.TestInfo): is_filtered=builder.is_filtered(), ) ) + + # Also add MIR checks if we have them for this function + if mir_run_list and func_name: + mir.add_mir_checks_for_function( + ti.path, + output_lines, + mir_run_list, + mir_func_dict, + func_name, + single_bb=False, # Don't skip basic block labels. + print_fixed_stack=False, # Don't print fixed stack (ASM tests don't need it). + first_check_is_next=False, # First check is LABEL, not NEXT. + at_the_function_name=False, # Use "name:" not "@name". + check_indent="", # No indentation for IR files (not MIR files). 
+ ) + is_in_function_start = False if is_in_function: diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h index 3c87c453a4cf0..5b7b45fdd1d58 100644 --- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -127,6 +127,18 @@ class AbstractDenseForwardDataFlowAnalysis : public DataFlowAnalysis { /// them into the same equivalent class. virtual void buildOperationEquivalentLatticeAnchor(Operation *op) {} + /// Visit a block and propagate the dense lattice forward along the control + /// flow edge from predecessor to block. `point` corresponds to the program + /// point before `block`. The default implementation merges in the state from + /// the predecessor's terminator. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *predecessor, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) { + // Merge in the state from the predecessor's terminator. + join(after, before); + } + /// Propagate the dense lattice forward along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` /// values correspond to control flow branches originating at or targeting the @@ -259,6 +271,22 @@ class DenseForwardDataFlowAnalysis branch, regionFrom, regionTo, before, after); } + /// Hook for customizing the behavior of lattice propagation along the control + /// flow edges between blocks. The control flows from `predecessor` to + /// `block`. The lattice is propagated forward along this edge. The lattices + /// are as follows: + /// - `before` is the lattice at the end of the predecessor block; + /// - `after` is the lattice at the beginning of the block. + /// By default, the `after` state is simply joined with the `before` state. + /// Concrete analyses can override this behavior or delegate to the parent + /// call for the default behavior. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *predecessor, const LatticeT &before, + LatticeT *after) { + AbstractDenseForwardDataFlowAnalysis::visitBlockTransfer( + block, point, predecessor, before, after); + } + protected: /// Get the dense lattice on this lattice anchor. LatticeT *getLattice(LatticeAnchor anchor) override { @@ -306,6 +334,13 @@ class DenseForwardDataFlowAnalysis static_cast<const LatticeT &>(before), static_cast<LatticeT *>(after)); } + void visitBlockTransfer(Block *block, ProgramPoint *point, Block *predecessor, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) final { + visitBlockTransfer(block, point, predecessor, + static_cast<const LatticeT &>(before), + static_cast<LatticeT *>(after)); + } }; //===----------------------------------------------------------------------===// @@ -388,6 +423,17 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis { /// them into the same equivalent class. virtual void buildOperationEquivalentLatticeAnchor(Operation *op) {} + /// Visit a block and propagate the dense lattice backward along the control + /// flow edge from successor to block. `point` corresponds to the program + /// point after `block`. The default implementation merges in the state from + /// the successor's first operation or the block itself when empty. 
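+  /// As an illustrative sketch (the predicate name is hypothetical), an
+  /// analysis deriving from this class could override the hook to skip
+  /// propagation along edges it models through other means:
+  ///
+  ///   void visitBlockTransfer(Block *block, ProgramPoint *point,
+  ///                           Block *successor,
+  ///                           const AbstractDenseLattice &after,
+  ///                           AbstractDenseLattice *before) override {
+  ///     if (isHandledElsewhere(block, successor))
+  ///       return; // Leave `before` untouched for this edge.
+  ///     meet(before, after);
+  ///   }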
+ virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *successor, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) { + meet(before, after); + } + /// Propagate the dense lattice backwards along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` /// values correspond to control flow branches originating at or targeting the @@ -531,6 +577,22 @@ class DenseBackwardDataFlowAnalysis branch, regionFrom, regionTo, after, before); } + /// Hook for customizing the behavior of lattice propagation along the control + /// flow edges between blocks. The control flows from `successor` to + /// `block`. The lattice is propagated back along this edge. The lattices + /// are as follows: + /// - `after` is the lattice at the beginning of the successor block; + /// - `before` is the lattice at the end of the block. + /// By default, the `before` state is simply met with the `after` state. + /// Concrete analyses can override this behavior or delegate to the parent + /// call for the default behavior. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *successor, const LatticeT &after, + LatticeT *before) { + AbstractDenseBackwardDataFlowAnalysis::visitBlockTransfer( + block, point, successor, after, before); + } + protected: /// Get the dense lattice at the given lattice anchor. LatticeT *getLattice(LatticeAnchor anchor) override { @@ -577,6 +639,13 @@ class DenseBackwardDataFlowAnalysis static_cast<const LatticeT &>(after), static_cast<LatticeT *>(before)); } + void visitBlockTransfer(Block *block, ProgramPoint *point, Block *successor, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) final { + visitBlockTransfer(block, point, successor, + static_cast<const LatticeT &>(after), + static_cast<LatticeT *>(before)); + } }; } // end namespace dataflow diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h index 964281592cc65..cad6cec761ab8 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -92,12 +92,43 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern<SourceOp> { using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern; using Super = VectorConvertToLLVMPattern<SourceOp, TargetOp>; + /// Return the given type if it's a floating point type. If the given type is + /// a vector type, return its element type if it's a floating point type. + static FloatType getFloatingPointType(Type type) { + if (auto floatType = dyn_cast<FloatType>(type)) + return floatType; + if (auto vecType = dyn_cast<VectorType>(type)) + return dyn_cast<FloatType>(vecType.getElementType()); + return nullptr; + } + LogicalResult matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { static_assert( std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value, "expected single result op"); + + // The pattern should not apply if a floating-point operand is converted to + // a non-floating-point type. This indicates that the floating point type + // is not supported by the LLVM lowering. (Such types are converted to + // integers.) 
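+    // For instance, assuming a type converter that maps an unsupported float
+    // type (say, f8E4M3FN) to i8, an op with such an operand would be
+    // rejected here and left for other lowering paths to handle.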
+ auto checkType = [&](Value v) -> LogicalResult { + FloatType floatType = getFloatingPointType(v.getType()); + if (!floatType) + return success(); + Type convertedType = this->getTypeConverter()->convertType(floatType); + if (!isa_and_nonnull<FloatType>(convertedType)) + return rewriter.notifyMatchFailure(op, + "unsupported floating point type"); + return success(); + }; + for (Value operand : op->getOperands()) + if (failed(checkType(operand))) + return failure(); + if (failed(checkType(op->getResult(0)))) + return failure(); + // Determine attributes for the target op AttrConvert<SourceOp, TargetOp> attrConvert(op); diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index ba5e48e4ec9ba..10f0cc254ea97 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -476,9 +476,9 @@ def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind", def ReduxKindAttr : EnumAttr<NVVM_Dialect, ReduxKind, "redux_kind">; def NVVM_ReduxOp : - NVVM_Op<"redux.sync", [NVVMRequiresSM<80>]>, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_Type:$val, + NVVM_Op<"redux.sync", [NVVMRequiresSM<80>, AllTypesMatch<["res", "val"]>]>, + Results<(outs AnyTypeOf<[I32, F32]>:$res)>, + Arguments<(ins AnyTypeOf<[I32, F32]>:$val, ReduxKindAttr:$kind, I32:$mask_and_clamp, DefaultValuedAttr<BoolAttr, "false">:$abs, @@ -496,6 +496,8 @@ def NVVM_ReduxOp : [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync) }]; + let hasVerifier = 1; + string llvmBuilder = [{ auto intId = getReduxIntrinsicId($_resultType, $kind, $abs, $nan); $res = createIntrinsicCall(builder, intId, {$val, $mask_and_clamp}); @@ -656,8 +658,8 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, } def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr)> { + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> { let summary = "MBarrier Arrive Operation"; let description = [{ The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the @@ -674,36 +676,32 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, value are implementation-specific. The operation takes the following operand: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. 
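+
+    Illustrative example (the SSA names are placeholders; a shared::cta
+    pointer is assumed):
+
+    ```mlir
+    %token = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64
+    ```
+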
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive, {$addr}); - }]; let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)"; -} - -def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr)> { - let summary = "Shared MBarrier Arrive Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_shared, {$addr}); + auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr attr-dict `:` qualified(type($addr)) `->` type($res)"; } def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> { + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I32:$count)> { let summary = "MBarrier Arrive No-Complete Operation"; let description = [{ The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation @@ -721,33 +719,29 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, captures the phase of the *mbarrier object* prior to the arrive-on operation. The operation takes the following operands: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. - `count`: Integer specifying the count argument to the arrive-on operation. Must be in the valid range as specified in the *mbarrier object* contents. 
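+
+    Illustrative example (the SSA names are placeholders; a shared::cta
+    pointer is assumed):
+
+    ```mlir
+    %token = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64
+    ```
+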
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete, {$addr, $count}); - }]; - let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; -} -def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> { - let summary = "Shared MBarrier Arrive No-Complete Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. + let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared, {$addr, $count}); + auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; } def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">, @@ -896,8 +890,9 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p } def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr, LLVM_Type:$state)> { + Results<(outs I1:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I64:$state)> { let summary = "MBarrier Non-Blocking Test Wait Operation"; let description = [{ The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the @@ -944,26 +939,20 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state}); - }]; - let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; -} -def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr, LLVM_Type:$state)> { - let summary = "Shared MBarrier Non-Blocking Test Wait Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. 
+ let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state}); + auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; } //===----------------------------------------------------------------------===// @@ -1326,9 +1315,9 @@ def ShflKindAttr : EnumAttr<NVVM_Dialect, ShflKind, "shfl_kind">; def NVVM_ShflOp : NVVM_Op<"shfl.sync", [NVVMRequiresSM<30>]>, - Results<(outs LLVM_Type:$res)>, + Results<(outs AnyTypeOf<[I32, F32, LLVMStructType]>:$res)>, Arguments<(ins I32:$thread_mask, - LLVM_Type:$val, + AnyTypeOf<[I32, F32]>:$val, I32:$offset, I32:$mask_and_clamp, ShflKindAttr:$kind, @@ -1344,6 +1333,11 @@ def NVVM_ShflOp : a mask for logically splitting warps into sub-segments and an upper bound for clamping the source lane index. + The `return_value_and_is_valid` unit attribute can be specified to indicate + that the return value is a two-element struct, where the first element is + the result value and the second element is a predicate indicating if the + computed source lane index is valid. + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync) }]; string llvmBuilder = [{ @@ -1534,47 +1528,30 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> { The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track all prior cp.async operations initiated by the executing thread. The `addr` operand specifies the address of the *mbarrier object* - in generic address space. The `noinc` attr impacts how the - mbarrier's state is updated. + in generic or shared::cta address space. When it is generic, the + underlying memory should fall within the shared::cta space; + otherwise the behavior is undefined. The `noinc` attr impacts + how the mbarrier's state is updated. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive) }]; - let assemblyFormat = "$addr attr-dict `:` type(operands)"; let arguments = (ins - LLVM_AnyPointer:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc); - - string llvmBuilder = [{ - auto intId = $noinc ? - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc : - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive; + AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + DefaultValuedAttr<I1Attr, "0">:$noinc); - createIntrinsicCall(builder, intId, {$addr}); - }]; -} - -def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.shared"> { - let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive.shared"; - let description = [{ - The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object* - track all prior cp.async operations initiated by the executing thread. 
- The `addr` operand specifies the address of the *mbarrier object* in - shared memory. The `noinc` attr impacts how the mbarrier's state - is updated. - - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive) - }]; let assemblyFormat = "$addr attr-dict `:` type(operands)"; - let arguments = (ins - LLVM_PointerShared:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc); + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); + }]; string llvmBuilder = [{ - auto intId = $noinc ? - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared : - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared; - - createIntrinsicCall(builder, intId, {$addr}); + auto [id, args] = NVVM::CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; } @@ -1589,10 +1566,11 @@ def FPRoundingModeRM : I32EnumAttrCase<"RM", 2, "rm">; def FPRoundingModeRP : I32EnumAttrCase<"RP", 3, "rp">; def FPRoundingModeRZ : I32EnumAttrCase<"RZ", 4, "rz">; def FPRoundingModeRNA : I32EnumAttrCase<"RNA", 5, "rna">; +def FPRoundingModeRS : I32EnumAttrCase<"RS", 6, "rs">; def FPRoundingMode : I32EnumAttr<"FPRoundingMode", "NVVM FPRoundingMode kind", [FPRoundingModeNone, FPRoundingModeRN, FPRoundingModeRM, - FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA]> { + FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA, FPRoundingModeRS]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; } @@ -1906,6 +1884,96 @@ def NVVM_ConvertF6x2ToF16x2Op : def NVVM_ConvertF4x2ToF16x2Op : NVVM_ConvertToFP16x2Op_Base<"F4", I8, "F16">; +//===----------------------------------------------------------------------===// +// NVVM Stochastic Rounding Conversion Ops +//===----------------------------------------------------------------------===// + +// Base class for conversions from F32x2 to FPx2 formats +// (F16x2, BF16x2) +// TODO: In separate PR, add .rn and .rz rounding variants for this conversion +// as currently only support .rs rounding mode +class NVVM_ConvertF32x2ToFPx2OpBase<string dstFormat, string mnemonic, Type dstType> : + NVVM_Op<mnemonic, [Pure, NVVMRequiresSMa<[100, 103]>]>, + Results<(outs dstType:$dst)>, + Arguments<(ins F32:$src_hi, F32:$src_lo, I32:$rbits, + DefaultValuedAttr<FPRoundingModeAttr, "FPRoundingMode::RS">:$rnd, + DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat, + DefaultValuedAttr<BoolAttr, "false">:$relu)> { + let summary = "Convert two F32 values to packed " # dstFormat # " with stochastic rounding (.rs)"; + let description = [{ + Converts two F32 values to packed }] # dstFormat # [{ format using stochastic + rounding (.rs) mode with randomness provided by the `rbits` parameter. The + `relu` attribute clamps negative results to 0. The `sat` attribute determines + saturation behavior. The `src_hi` and `src_lo` parameters correspond to operands + `a` and `b` in the PTX ISA, respectively. 
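+
+    As an illustrative sketch for the f16x2 variant (the SSA names are
+    placeholders):
+
+    ```mlir
+    %d = nvvm.convert.f32x2.to.f16x2 %hi, %lo, %rbits : vector<2xf16>
+    ```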
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let assemblyFormat = "$src_hi `,` $src_lo `,` $rbits attr-dict `:` type($dst)"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + llvm::Intrinsic::ID getIntrinsicID(); + }]; + + string llvmBuilder = [{ + auto intId = op.getIntrinsicID(); + $dst = createIntrinsicCall(builder, intId, {$src_hi, $src_lo, $rbits}); + }]; + } + +// F32x2 -> F16x2 with stochastic rounding +def NVVM_ConvertF32x2ToF16x2Op : NVVM_ConvertF32x2ToFPx2OpBase<"f16x2", "convert.f32x2.to.f16x2", VectorOfLengthAndType<[2], [F16]>>; + +// F32x2 -> BF16x2 with stochastic rounding +def NVVM_ConvertF32x2ToBF16x2Op : NVVM_ConvertF32x2ToFPx2OpBase<"bf16x2", "convert.f32x2.to.bf16x2", VectorOfLengthAndType<[2], [BF16]>>; + +// Base class for stochastic rounding conversions from F32x4 to FPx4 formats +// (E4M3x4, E5M2x4, E2M3x4, E3M2x4, E2M1x4) +// These operations always use RS (stochastic rounding) mode with SATFINITE saturation. +class NVVM_ConvertF32x4ToFPx4OpBase<string dstFormat, string mnemonic, Type dstType> : + NVVM_Op<mnemonic, [Pure, NVVMRequiresSMa<[100, 103]>]>, + Results<(outs dstType:$dst)>, + Arguments<(ins VectorOfLengthAndType<[4], [F32]>:$src, I32:$rbits, + DefaultValuedAttr<BoolAttr, "false">:$relu, + TypeAttr:$dstTy)> { + let summary = "Convert vector<4xf32> to packed " # dstFormat # " with stochastic rounding (.rs) and satfinite"; + let description = [{ + Converts a vector<4xf32> to packed }] # dstFormat # [{ format using + stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is + provided by the `rbits` parameter. The `dstTy` attribute specifies the + target floating-point format. The `relu` attribute clamps negative results to 0. + + Note: These operations always use RS rounding mode and SATFINITE saturation mode. 
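+
+    As an illustrative sketch for the f8x4 variant (the SSA names are
+    placeholders):
+
+    ```mlir
+    %d = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN)
+    ```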
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let assemblyFormat = "$src `,` $rbits attr-dict `:` type($src) `->` type($dst) `(` $dstTy `)`"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + llvm::Intrinsic::ID getIntrinsicID(); + }]; + + string llvmBuilder = [{ + auto intId = op.getIntrinsicID(); + $dst = createIntrinsicCall(builder, intId, {$src, $rbits}); + }]; +} + +// F32x4 -> F8x4 with stochastic rounding (supports E4M3FN, E5M2) +def NVVM_ConvertF32x4ToF8x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f8x4", "convert.f32x4.to.f8x4", VectorOfLengthAndType<[4], [I8]>>; + +// F32x4 -> F6x4 with stochastic rounding (supports E2M3FN, E3M2FN) +def NVVM_ConvertF32x4ToF6x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f6x4", "convert.f32x4.to.f6x4", VectorOfLengthAndType<[4], [I8]>>; + +// F32x4 -> F4x4 with stochastic rounding (supports E2M1FN) +def NVVM_ConvertF32x4ToF4x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f4x4", "convert.f32x4.to.f4x4", I16>; + //===----------------------------------------------------------------------===// // NVVM MMA Ops //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 5241f9a6f2b43..921fdf36a59b0 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -692,6 +692,38 @@ def ROCDL_GlobalLoadLDSOp : }]; } +//===---------------------------------------------------------------------===// +// Async load to LDS intrinsic (available in GFX1250) +//===---------------------------------------------------------------------===// + +foreach bitsVal = [8, 32, 64, 128] in { + defvar bitsStr = "b" # !cast<string>(bitsVal); + def ROCDL_GlobalLoadAsyncToLDS # !toupper(bitsStr) # Op : + ROCDL_IntrOp<"global.load.async.to.lds." # bitsStr, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> { + dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr, + Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr, + I32Attr:$offset, + I32Attr:$aux); + let arguments = !con(args, baseArgs); + let assemblyFormat = [{ + $globalPtr `,` $ldsPtr `,` $offset `,` $aux + attr-dict `:` type($globalPtr) `,` type($ldsPtr) + }]; + let description = [{ + Asynchronously loads }] # !cast<string>(bitsVal) # [{ bits of data from a global memory pointer + to a Local Data Share (LDS) pointer. + + Available on gfx1250+. + }]; + + let extraClassDefinition = [{ + ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { + return {getGlobalPtr(), getLdsPtr()}; + } + }]; + } +} + //===---------------------------------------------------------------------===// // Tensor load/store intrinsics (available in GFX1250) //===---------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 2f4517ddfe754..c689b7e46ea9e 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2557,6 +2557,12 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", device-type-aware getter methods. When modifying these operands, the corresponding `device_type` attributes must be updated to maintain consistency between operands and their target device types. 
+ + The `unstructured` attribute indicates that the loops inside the OpenACC + construct contain early exits and cannot be lowered to structured MLIR + operations. When this flag is set, the acc.loop should have no induction + variables and the loop must be implemented via explicit control flow + inside its body. }]; let arguments = (ins @@ -2590,7 +2596,8 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", OptionalAttr<SymbolRefArrayAttr>:$firstprivatizationRecipes, Variadic<AnyType>:$reductionOperands, OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, - OptionalAttr<OpenACC_CombinedConstructsAttr>:$combined + OptionalAttr<OpenACC_CombinedConstructsAttr>:$combined, + UnitAttr:$unstructured ); let results = (outs Variadic<AnyType>:$results); diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 1481859e94a92..0c059967bb898 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -30,9 +30,11 @@ class SliceAttr; } // namespace xegpu } // namespace mlir +// clang-format off +#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc> #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc> #include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc> -#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc> +// clang-format on #define GET_ATTRDEF_CLASSES #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc> diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 40352b44b6441..9c35c07a7e587 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -223,17 +223,17 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Derive a new layout by dropping InstData", "xegpu::DistributeLayoutAttr", "dropInstData">, - InterfaceMethod<[{Delinearizes a linear subgroup ID into its multidimensional - indices based on the effective subgroup layout.}], + InterfaceMethod<[{Delinearizes a linear ID into its multidimensional + indices based on the effective layout level.}], "FailureOr<SmallVector<Value>>", - "delinearizeSubgroupId", + "delinearizeId", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>, - InterfaceMethod<[{Generates instructions to compute multidimensional offsets for blocks - assigned to a subgroup identified by linearId. The shape parameter - represents the workgroup-level problem size. Each subgroup may access + InterfaceMethod<[{Generates instructions to compute multidimensional coordinates for dist units + assigned to a level identified by linearId. The shape parameter + represents the higher-level problem size. Each level may access multiple blocks according to round-robin distribution rules.}], "FailureOr<SmallVector<SmallVector<Value>>>", - "getOffsets", + "computeDistributedCoords", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>, InterfaceMethod</*desc=*/[{Check if this layout can be achieved by applying a transpose to some other layout according to given permutation of (0...n-1).}], @@ -476,17 +476,17 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { return {}; } - /// Delinearizes a linear subgroup ID into its multidimensional indices - /// based on the effective subgroup layout. + /// Delinearizes a linear ID into its multidimensional indices + /// based on the effective level of the layout. 
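+    /// For example, assuming an effective layout of [4, 8] and row-major
+    /// delinearization (both assumed here purely for illustration), a linear
+    /// id of 10 would map to the indices [1, 2].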
FailureOr<SmallVector<Value>> - delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId); + delinearizeId(OpBuilder &builder, Location loc, Value linearId); - /// Generates instructions to compute multidimensional offsets for blocks - /// assigned to a subgroup identified by linearId. The shape parameter - /// represents the workgroup-level problem size. Each subgroup may access + /// Generates instructions to compute multidimensional coordinates for dist units + /// assigned to a level identified by linearId. The shape parameter + /// represents the higher-level problem size. Each `level` may access /// multiple blocks according to round-robin distribution rules. FailureOr<SmallVector<SmallVector<Value>>> - getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); + computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); /// Check if this is slice of some other layout. bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; } @@ -643,14 +643,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Delinearizes a linear subgroup ID into its multidimensional indices /// based on the effective subgroup layout. FailureOr<SmallVector<Value>> - delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId); + delinearizeId(OpBuilder &builder, Location loc, Value linearId); - /// Generates instructions to compute multidimensional offsets for blocks + /// Generates instructions to compute multidimensional coordinates for blocks /// assigned to a subgroup identified by linearId. The shape parameter /// represents the workgroup-level problem size. Each subgroup may access /// multiple blocks according to round-robin distribution rules. + FailureOr<SmallVector<SmallVector<Value>>> - getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); + computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); /// Check if this is slice of some other layout. 
bool isSliceOf(const xegpu::DistributeLayoutAttr &other); diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 426377fcf598f..689ebd0d1179a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -843,7 +843,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size, OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, - OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint); + OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, + OptionalAttr<XeGPU_LayoutAttr>:$layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -895,7 +896,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "IntegerAttr": $chunk_size, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Type": $value, "Value": $source, + "ArrayRef<OpFoldResult>": $offsets, "Value": $mask, + "IntegerAttr": $chunk_size, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint, + "xegpu::LayoutAttr": $layout)> ]; let hasVerifier = 1; @@ -979,7 +987,8 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size, OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, - OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint); + OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, + OptionalAttr<XeGPU_LayoutAttr>:$layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { @@ -1030,7 +1039,14 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { "IntegerAttr": $chunk_size, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Value": $value, "Value": $dest, + "ArrayRef<OpFoldResult>": $offsets, "Value": $mask, + "IntegerAttr": $chunk_size, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint, + "xegpu::LayoutAttr": $layout)> ]; let hasVerifier = 1; diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index b7af5413669c9..e42799689e490 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -26,7 +26,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> { The pass distributes subgroup level (SIMD) XeGPU ops to work items. }]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", - "vector::VectorDialect"]; + "vector::VectorDialect", "index::IndexDialect"]; } def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { @@ -85,4 +85,16 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> { "scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"]; } +def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> { + let summary = "Optimize XeGPU block load operations"; + let description = [{ + This pass rewrites XeGPU loadNd operations into more optimal forms + to improve performance. 
This includes: - Rewriting transpose B loads into more optimal forms to use HW block + transpose instructions for better performance. + }]; + let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", + "vector::VectorDialect"]; +} + #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h index a480195eebd00..1776a209d0bf1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h @@ -61,7 +61,8 @@ struct UnrollOptions { /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`. void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns); - +/// Appends patterns for optimizing block load operations into `patterns`. +void populateXeGPUOptimizeBlockLoadsPatterns(RewritePatternSet &patterns); /// Appends patterns for XeGPU SIMT distribution into `patterns`. void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns); /// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op. diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 04cfd58d846a7..58092c3bb9ed2 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -104,11 +104,15 @@ void removeLayoutAttrs(Operation *op); /// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching /// it to the owner's dictionary attributes +/// If `respectPermLayout` is true, the existing permanent layout +/// attribute will be kept and assigned to the attribute dict instead +/// of the provided layout. template <typename T, typename = std::enable_if_t<std::is_same_v<T, OpOperand> || std::is_same_v<T, OpResult>>> void setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout); + const DistributeLayoutAttr layout, + bool respectPermLayout = false); /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation. If the operation contains regions, it is also applied recursively @@ -162,6 +166,15 @@ SmallVector<OpFoldResult> addElementwise(OpBuilder &builder, Location loc, SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef<OpFoldResult> lhs, ArrayRef<OpFoldResult> rhs); + +/// Helper function to find a proper instruction multiple for the user-supplied +/// sg-level data shape (given by `dim`). `candidates` are uArch allowed shapes. +/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or +/// array length). +template <typename T> +int getLargestDivisor(T dim, ArrayRef<T> candidates, + ArrayRef<T> candidateMultiples = {}); + } // namespace xegpu } // namespace mlir diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index ed7e2a08ebfd9..5ac9e26e8636d 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -981,6 +981,28 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Return a reference to the internal implementation. detail::ConversionPatternRewriterImpl &getImpl(); + /// Attempt to legalize the given operation. This can be used within + /// conversion patterns to change the default pre-order legalization order.
+ /// Returns "success" if the operation was legalized, "failure" otherwise. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// the operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Operation *op); + + /// Attempt to legalize the given region. This can be used within + /// conversion patterns to change the default pre-order legalization order. + /// Returns "success" if the region was legalized, "failure" otherwise. + /// + /// If the current pattern runs with a type converter, the entry block + /// signature will be converted before legalizing the operations in the + /// region. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// an operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Region *r); + private: // Allow OperationConverter to construct new rewriters. friend struct OperationConverter; @@ -989,7 +1011,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// conversions. They apply some IR rewrites in a delayed fashion and could /// bring the IR into an inconsistent state when used standalone. explicit ConversionPatternRewriter(MLIRContext *ctx, - const ConversionConfig &config); + const ConversionConfig &config, + OperationConverter &converter); // Hide unsupported pattern rewriter API. using OpBuilder::setListener; diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp index 0682e5f26785a..22bc0b32a9bd1 100644 --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -266,9 +266,10 @@ void AbstractDenseForwardDataFlowAnalysis::visitBlock(Block *block) { } LDBG() << " Joining state from predecessor " << predecessor; + const AbstractDenseLattice &before = *getLatticeFor( + point, getProgramPointAfter(predecessor->getTerminator())); // Merge in the state from the predecessor's terminator. - join(after, *getLatticeFor( - point, getProgramPointAfter(predecessor->getTerminator()))); + visitBlockTransfer(block, point, predecessor, before, after); } } @@ -614,7 +615,9 @@ void AbstractDenseBackwardDataFlowAnalysis::visitBlock(Block *block) { LDBG() << " Meeting state from successor " << successor; // Merge in the state from the successor: either the first operation, or the // block itself when empty. 
- meet(before, *getLatticeFor(point, getProgramPointBefore(successor))); + visitBlockTransfer(block, point, successor, + *getLatticeFor(point, getProgramPointBefore(successor)), + before); } } diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp index 82bdb844480f1..74936e32bd9d9 100644 --- a/mlir/lib/AsmParser/Parser.cpp +++ b/mlir/lib/AsmParser/Parser.cpp @@ -407,8 +407,8 @@ Parser::parseFloatFromIntegerLiteral(std::optional<APFloat> &result, "hexadecimal float constant out of range for type"); } - APInt truncatedValue(typeSizeInBits, intValue.getNumWords(), - intValue.getRawData()); + APInt truncatedValue(typeSizeInBits, + ArrayRef(intValue.getRawData(), intValue.getNumWords())); result.emplace(semantics, truncatedValue); return success(); } diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index ec182f1db48ac..9348d3c172a07 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -865,13 +865,7 @@ struct NVGPUMBarrierArriveLowering adaptor.getMbarId(), rewriter); Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType, - barrier); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, - barrier); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, barrier); return success(); } }; @@ -892,13 +886,8 @@ struct NVGPUMBarrierArriveNoCompleteLowering Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); Value count = truncToI32(b, adaptor.getCount()); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>( - op, tokenType, barrier, count); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>( - op, tokenType, barrier, count); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>( + op, tokenType, barrier, count); return success(); } }; @@ -915,13 +904,8 @@ struct NVGPUMBarrierTestWaitLowering getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(), adaptor.getMbarId(), rewriter); Type retType = rewriter.getI1Type(); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>( - op, retType, barrier, adaptor.getToken()); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>( - op, retType, barrier, adaptor.getToken()); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(op, retType, barrier, + adaptor.getToken()); return success(); } }; diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 91c1aa55fdb4e..1b4d1a42614ea 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -97,57 +97,23 @@ static LogicalResult transferPreconditions(PatternRewriter &rewriter, return success(); } -static xegpu::CreateNdDescOp -createNdDescriptor(PatternRewriter &rewriter, Location loc, - xegpu::TensorDescType descType, TypedValue<MemRefType> src, - Operation::operand_range offsets) { +static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter, + Location loc, + xegpu::TensorDescType descType, + TypedValue<MemRefType> src) { MemRefType srcTy = src.getType(); auto 
[strides, offset] = srcTy.getStridesAndOffset(); xegpu::CreateNdDescOp ndDesc; if (srcTy.hasStaticShape()) { - ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, - getAsOpFoldResult(offsets)); + ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src); } else { // In case of any dynamic shapes, source's shape and strides have to be // explicitly provided. - SmallVector<Value> sourceDims; - unsigned srcRank = srcTy.getRank(); - for (unsigned i = 0; i < srcRank; ++i) - sourceDims.push_back(memref::DimOp::create(rewriter, loc, src, i)); - - SmallVector<int64_t> constOffsets; - SmallVector<Value> dynOffsets; - for (Value offset : offsets) { - std::optional<int64_t> staticVal = getConstantIntValue(offset); - if (!staticVal) - dynOffsets.push_back(offset); - constOffsets.push_back(staticVal.value_or(ShapedType::kDynamic)); - } - - SmallVector<Value> dynShapes; - for (auto [idx, shape] : llvm::enumerate(srcTy.getShape())) { - if (shape == ShapedType::kDynamic) - dynShapes.push_back(sourceDims[idx]); - } - - // Compute strides in reverse order. - SmallVector<Value> dynStrides; - Value accStride = arith::ConstantIndexOp::create(rewriter, loc, 1); - // Last stride is guaranteed to be static and unit. - for (int i = static_cast<int>(strides.size()) - 2; i >= 0; --i) { - accStride = - arith::MulIOp::create(rewriter, loc, accStride, sourceDims[i + 1]); - if (strides[i] == ShapedType::kDynamic) - dynStrides.push_back(accStride); - } - std::reverse(dynStrides.begin(), dynStrides.end()); - - ndDesc = xegpu::CreateNdDescOp::create( - rewriter, loc, descType, src, dynOffsets, dynShapes, dynStrides, - DenseI64ArrayAttr::get(rewriter.getContext(), constOffsets), - DenseI64ArrayAttr::get(rewriter.getContext(), srcTy.getShape()), - DenseI64ArrayAttr::get(rewriter.getContext(), strides)); + auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, src); + ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, + meta.getConstifiedMixedSizes(), + meta.getConstifiedMixedStrides()); } return ndDesc; @@ -392,6 +358,62 @@ static Value computeOffsets(PatternRewriter &rewriter, OpType gatScatOp, .getResult(); } +// Collapses shapes of a nD memref to the target rank while applying offsets for +// the collapsed dimensions. Returns the new memref value and the remaining +// offsets for the last targetRank dimensions. 
For example: +// input: %memref = memref<2x4x8x32xf32>, offsets=[%i0, %i1, %i2, %i3], +// output: %memref[%i0, %i1, 0, 0] -> memref<8x32xf32>, offsets: [%i2, %i3] +static std::pair<Value, SmallVector<OpFoldResult>> +convertMemrefAndOffsetsToTargetRank(PatternRewriter &rewriter, Location loc, + Value memref, + SmallVector<OpFoldResult> offsets, + int64_t targetRank) { + auto memrefType = cast<MemRefType>(memref.getType()); + unsigned rank = memrefType.getRank(); + + if (rank <= targetRank) + return {memref, offsets}; + + int64_t numCombinedDims = rank - targetRank; + SmallVector<OpFoldResult> subviewOffsets; + SmallVector<OpFoldResult> subviewSizes; + SmallVector<OpFoldResult> subviewStrides; + + // For the combined dimensions: use the provided offsets, size=1, stride=1 + for (unsigned i = 0; i < numCombinedDims; ++i) { + subviewOffsets.push_back(offsets[i]); + subviewSizes.push_back(rewriter.getI64IntegerAttr(1)); + subviewStrides.push_back(rewriter.getI64IntegerAttr(1)); + } + + // For the last targetRank dimensions: offset=0, use full size, stride=1 + SmallVector<int64_t> resultShape; + auto originalShape = memrefType.getShape(); + auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, memref); + for (unsigned i = numCombinedDims; i < rank; ++i) { + subviewOffsets.push_back(rewriter.getI64IntegerAttr(0)); + if (ShapedType::isDynamic(originalShape[i])) { + subviewSizes.push_back(meta.getSizes()[i]); + resultShape.push_back(ShapedType::kDynamic); + } else { + subviewSizes.push_back(rewriter.getI64IntegerAttr(originalShape[i])); + resultShape.push_back(originalShape[i]); + } + subviewStrides.push_back(rewriter.getI64IntegerAttr(1)); + } + + auto resultType = memref::SubViewOp::inferRankReducedResultType( + resultShape, memrefType, subviewOffsets, subviewSizes, subviewStrides); + auto subviewOp = + memref::SubViewOp::create(rewriter, loc, resultType, memref, + subviewOffsets, subviewSizes, subviewStrides); + + // Return the remaining offsets for the last targetRank dimensions + SmallVector<OpFoldResult> newOffsets(offsets.begin() + numCombinedDims, + offsets.end()); + return {subviewOp.getResult(), newOffsets}; +} + template < typename OpType, typename = std::enable_if_t<llvm::is_one_of< @@ -435,7 +457,8 @@ static LogicalResult lowerToScatteredLoadOp(vector::TransferReadOp readOp, /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.replaceOp(readOp, gatherOp.getResult()); return success(); @@ -469,7 +492,8 @@ static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp, /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.eraseOp(writeOp); return success(); } @@ -523,18 +547,19 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> { descShape, elementType, /*array_length=*/1, /*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = - createNdDescriptor(rewriter, loc, descType, - dyn_cast<TypedValue<MemRefType>>(readOp.getBase()), - readOp.getIndices()); - DenseI64ArrayAttr transposeAttr = !isTransposeLoad ? 
nullptr : DenseI64ArrayAttr::get(rewriter.getContext(), ArrayRef<int64_t>{1, 0}); + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, readOp.getBase(), getAsOpFoldResult(readOp.getIndices()), + vecTy.getRank()); // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; - auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + + auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices, /*packed=*/nullptr, transposeAttr, /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint); @@ -575,21 +600,23 @@ struct TransferWriteLowering if (!map.isMinorIdentity()) return rewriter.notifyMatchFailure(writeOp, "Expects identity map"); + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, writeOp.getBase(), + getAsOpFoldResult(writeOp.getIndices()), vecTy.getRank()); + auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, /*boundary_check=*/writeOp.hasOutOfBoundsDim(), xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = - createNdDescriptor(rewriter, loc, descType, - dyn_cast<TypedValue<MemRefType>>(writeOp.getBase()), - writeOp.getIndices()); - // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; - auto storeOp = - xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(), ndDesc, - /*l1_hint=*/hint, - /*l2_hint=*/hint, /*l3_hint=*/hint); + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + + auto storeOp = xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(), + ndDesc, indices, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); rewriter.replaceOp(writeOp, storeOp); return success(); @@ -621,7 +648,8 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> { /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); auto selectOp = arith::SelectOp::create(rewriter, loc, gatherOp.getMask(), @@ -655,7 +683,8 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> { /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.eraseOp(scatterOp); return success(); } @@ -674,19 +703,24 @@ struct LoadLowering : public OpRewritePattern<vector::LoadOp> { // Boundary check is available only for block instructions. bool boundaryCheck = vecTy.getRank() > 1; + // By default, no specific caching policy is assigned. + xegpu::CachePolicyAttr hint = nullptr; + + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, loadOp.getBase(), getAsOpFoldResult(loadOp.getIndices()), + vecTy.getRank()); auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = createNdDescriptor( - rewriter, loc, descType, loadOp.getBase(), loadOp.getIndices()); - // By default, no specific caching policy is assigned. 
- xegpu::CachePolicyAttr hint = nullptr; - auto loadNdOp = xegpu::LoadNdOp::create( - rewriter, loc, vecTy, ndDesc, /*packed=*/nullptr, /*transpose=*/nullptr, - /*l1_hint=*/hint, - /*l2_hint=*/hint, /*l3_hint=*/hint); + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + auto loadNdOp = + xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices, + /*packed=*/nullptr, /*transpose=*/nullptr, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); rewriter.replaceOp(loadOp, loadNdOp); return success(); @@ -708,18 +742,24 @@ struct StoreLowering : public OpRewritePattern<vector::StoreOp> { // Boundary check is available only for block instructions. bool boundaryCheck = vecTy.getRank() > 1; + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, storeOp.getBase(), + getAsOpFoldResult(storeOp.getIndices()), vecTy.getRank()); + auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = createNdDescriptor( - rewriter, loc, descType, storeOp.getBase(), storeOp.getIndices()); // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + auto storeNdOp = - xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, + xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, indices, /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint); + rewriter.replaceOp(storeOp, storeNdOp); return success(); diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp index 33e8f2ed1f6ed..de552ce22ef62 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -562,6 +562,8 @@ class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> { VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType()); if (!valOrResVecTy) valOrResVecTy = VectorType::get(1, data.getType()); + if (valOrResVecTy.getShape().size() != 1) + return rewriter.notifyMatchFailure(op, "Expected 1D data vector."); int64_t elemBitWidth = valOrResVecTy.getElementType().getIntOrFloatBitWidth(); diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp index 4d2d8738aa4ad..3d1a73417d1ea 100644 --- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp @@ -66,9 +66,10 @@ static Value getSupportedReduction(AffineForOp forOp, unsigned pos, .Case([](arith::MaxSIOp) { return arith::AtomicRMWKind::maxs; }) .Case([](arith::MinUIOp) { return arith::AtomicRMWKind::minu; }) .Case([](arith::MaxUIOp) { return arith::AtomicRMWKind::maxu; }) + .Case([](arith::XOrIOp) { return arith::AtomicRMWKind::xori; }) + .Case([](arith::MaxNumFOp) { return arith::AtomicRMWKind::maxnumf; }) + .Case([](arith::MinNumFOp) { return arith::AtomicRMWKind::minnumf; }) .Default([](Operation *) -> std::optional<arith::AtomicRMWKind> { - // TODO: AtomicRMW supports other kinds of reductions this is - // currently not detecting, add those when the need arises. 
return std::nullopt; }); if (!maybeKind) diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index a5ffb9e77fa9d..d43f8815be16d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -365,6 +365,59 @@ LogicalResult ConvertF4x2ToF16x2Op::verify() { return success(); } +//===----------------------------------------------------------------------===// +// Stochastic Rounding Conversion Ops +//===----------------------------------------------------------------------===// + +LogicalResult ConvertF32x2ToF16x2Op::verify() { + if (getRnd() != FPRoundingMode::RS) + return emitOpError("Only RS rounding mode is supported for " + "conversions from f32x2 to f16x2."); + return success(); +} + +LogicalResult ConvertF32x2ToBF16x2Op::verify() { + if (getRnd() != FPRoundingMode::RS) + return emitOpError("Only RS rounding mode is supported for " + "conversions from f32x2 to bf16x2."); + return success(); +} + +LogicalResult ConvertF32x4ToF8x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float8E4M3FNType, mlir::Float8E5M2Type>(getDstTy())) + return emitOpError("Only ") + << mlir::Float8E4M3FNType::get(ctx) << " and " + << mlir::Float8E5M2Type::get(ctx) + << " types are supported for conversions from f32x4 to f8x4."; + + return success(); +} + +LogicalResult ConvertF32x4ToF6x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float6E2M3FNType, mlir::Float6E3M2FNType>(getDstTy())) + return emitOpError("Only ") + << mlir::Float6E2M3FNType::get(ctx) << " and " + << mlir::Float6E3M2FNType::get(ctx) + << " types are supported for conversions from f32x4 to f6x4."; + + return success(); +} + +LogicalResult ConvertF32x4ToF4x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float4E2M1FNType>(getDstTy())) + return emitOpError("Only ") << mlir::Float4E2M1FNType::get(ctx) + << " type is supported for conversions from " + "f32x4 to f4x4."; + + return success(); +} + LogicalResult BulkStoreOp::verify() { if (getInitVal() != 0) return emitOpError("only 0 is supported for initVal, got ") << getInitVal(); @@ -867,15 +920,40 @@ LogicalResult MmaOp::verify() { } LogicalResult ShflOp::verify() { - if (!(*this)->getAttrOfType<UnitAttr>("return_value_and_is_valid")) - return success(); - auto type = llvm::dyn_cast<LLVM::LLVMStructType>(getType()); - auto elementType = (type && type.getBody().size() == 2) - ? 
llvm::dyn_cast<IntegerType>(type.getBody()[1]) - : nullptr; - if (!elementType || elementType.getWidth() != 1) - return emitError("expected return type to be a two-element struct with " - "i1 as the second element"); + auto returnStructType = llvm::dyn_cast<LLVM::LLVMStructType>(getType()); + + auto verifyTypeError = [&](Twine desc, Type expectedType, + Type actualType) -> LogicalResult { + return emitOpError("expected " + desc + " to be of type ") + << expectedType << " but got " << actualType << " instead"; + }; + + if (returnStructType) { + if (!getReturnValueAndIsValid()) + return emitOpError("\"return_value_and_is_valid\" attribute must be " + "specified when the return type is a struct type"); + + if (returnStructType.getBody().size() != 2) + return emitOpError("expected return type to be a two-element struct"); + + llvm::ArrayRef<Type> returnStruct = returnStructType.getBody(); + auto resultType = returnStruct[0]; + if (resultType != getVal().getType()) + return verifyTypeError("first element in the returned struct", + getVal().getType(), resultType); + + auto predicateType = returnStruct[1]; + if (!predicateType.isInteger(1)) + return verifyTypeError("second element in the returned struct", + mlir::IntegerType::get(getContext(), 1), + predicateType); + } else { + if (getReturnValueAndIsValid()) + return emitOpError("expected return type to be a two-element struct"); + + if (getType() != getVal().getType()) + return verifyTypeError("return type", getVal().getType(), getType()); + } return success(); } @@ -1577,6 +1655,43 @@ LogicalResult NVVM::ClusterLaunchControlQueryCancelOp::verify() { return success(); } +LogicalResult NVVM::ReduxOp::verify() { + mlir::Type reduxType = getType(); + + if (!reduxType.isF32()) { + if (getAbs()) + return emitOpError("abs attribute is supported only for f32 type"); + if (getNan()) + return emitOpError("nan attribute is supported only for f32 type"); + } + + NVVM::ReduxKind kind = getKind(); + switch (kind) { + case NVVM::ReduxKind::ADD: + case NVVM::ReduxKind::AND: + case NVVM::ReduxKind::OR: + case NVVM::ReduxKind::XOR: + case NVVM::ReduxKind::MAX: + case NVVM::ReduxKind::MIN: + case NVVM::ReduxKind::UMAX: + case NVVM::ReduxKind::UMIN: + if (!reduxType.isInteger(32)) + return emitOpError("'") + << stringifyEnum(kind) << "' redux kind unsupported with " + << reduxType << " type. Only supported type is 'i32'."; + break; + case NVVM::ReduxKind::FMIN: + case NVVM::ReduxKind::FMAX: + if (!reduxType.isF32()) + return emitOpError("'") + << stringifyEnum(kind) << "' redux kind unsupported with " + << reduxType << " type. Only supported type is 'f32'."; + break; + } + + return success(); +} + /// Packs the given `field` into the `result`. /// The `result` is 64-bits and each `field` can be 32-bits or narrower. 
static llvm::Value * @@ -1637,15 +1752,21 @@ std::string NVVM::MBarrierInitOp::getPtx() { // getIntrinsicID/getIntrinsicIDAndArgs methods //===----------------------------------------------------------------------===// +static bool isPtrInAddrSpace(mlir::Value ptr, NVVMMemorySpace targetAS) { + auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(ptr.getType()); + return ptrTy.getAddressSpace() == static_cast<unsigned>(targetAS); +} + +static bool isPtrInSharedCTASpace(mlir::Value ptr) { + return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared); +} + mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast<NVVM::MBarrierInitOp>(op); - unsigned addressSpace = - llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) - .getAddressSpace(); - llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) - ? llvm::Intrinsic::nvvm_mbarrier_init_shared - : llvm::Intrinsic::nvvm_mbarrier_init; + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_init_shared + : llvm::Intrinsic::nvvm_mbarrier_init; // Fill the Intrinsic Args llvm::SmallVector<llvm::Value *> args; @@ -1658,16 +1779,72 @@ mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast<NVVM::MBarrierInvalOp>(op); - unsigned addressSpace = - llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) - .getAddressSpace(); - llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_inval_shared : llvm::Intrinsic::nvvm_mbarrier_inval; return {id, {mt.lookupValue(thisOp.getAddr())}}; } +mlir::NVVM::IDArgPair MBarrierArriveOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierArriveOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared + ? llvm::Intrinsic::nvvm_mbarrier_arrive_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive; + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + +mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = + isShared ? llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete; + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getCount())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair MBarrierTestWaitOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierTestWaitOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared + ? 
llvm::Intrinsic::nvvm_mbarrier_test_wait_shared + : llvm::Intrinsic::nvvm_mbarrier_test_wait; + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getState())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::CpAsyncMBarrierArriveOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + + llvm::Intrinsic::ID id; + if (thisOp.getNoinc()) { + id = isShared ? llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared + : llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc; + } else { + id = isShared ? llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared + : llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive; + } + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + #define CP_ASYNC_ID_IMPL(mod, size, suffix) \ llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix @@ -2469,6 +2646,85 @@ Tcgen05CommitOp::getIntrinsicIDAndArgs(Operation &op, return TCGEN05_CP_2CTA(shape_mc, , is_2cta); \ }() +llvm::Intrinsic::ID ConvertF32x2ToF16x2Op::getIntrinsicID() { + bool hasRelu = getRelu(); + bool hasSatFinite = (getSat() == NVVM::SaturationMode::SATFINITE); + + if (hasRelu && hasSatFinite) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_relu_satfinite; + if (hasRelu) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_relu; + if (hasSatFinite) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_satfinite; + return llvm::Intrinsic::nvvm_ff2f16x2_rs; +} + +llvm::Intrinsic::ID ConvertF32x2ToBF16x2Op::getIntrinsicID() { + bool hasRelu = getRelu(); + bool hasSatFinite = (getSat() == NVVM::SaturationMode::SATFINITE); + + if (hasRelu && hasSatFinite) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_relu_satfinite; + if (hasRelu) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_relu; + if (hasSatFinite) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_satfinite; + return llvm::Intrinsic::nvvm_ff2bf16x2_rs; +} + +llvm::Intrinsic::ID ConvertF32x4ToF8x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float8E4M3FNType>([&](mlir::Float8E4M3FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite; + }) + .Case<mlir::Float8E5M2Type>([&](mlir::Float8E5M2Type) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F8 type in ConvertF32x4ToF8x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + +llvm::Intrinsic::ID ConvertF32x4ToF6x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float6E2M3FNType>([&](mlir::Float6E2M3FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite; + }) + .Case<mlir::Float6E3M2FNType>([&](mlir::Float6E3M2FNType) { + return hasRelu ? 
llvm::Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F6 type in ConvertF32x4ToF6x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + +llvm::Intrinsic::ID ConvertF32x4ToF4x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float4E2M1FNType>([&](mlir::Float4E2M1FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F4 type in ConvertF32x4ToF4x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + llvm::Intrinsic::ID Tcgen05CpOp::getIntrinsicID(Operation &op) { auto curOp = cast<NVVM::Tcgen05CpOp>(op); bool is2CTA = curOp.getGroup() == CTAGroupKind::CTA_2; @@ -2508,6 +2764,9 @@ LogicalResult Tcgen05LdOp::verify() { if (getShape() == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && !getOffset()) result = emitError("shape 16x32bx2 requires offset argument"); + if (getShape() != NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && getOffset()) + result = emitError("offset argument is only supported for shape 16x32bx2"); + auto resTy = getRes().getType(); unsigned resLen = isa<VectorType>(resTy) ? llvm::cast<VectorType>(resTy).getNumElements() diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 35eba724a9059..b2f1d840f3bca 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -3068,8 +3068,12 @@ LogicalResult acc::LoopOp::verify() { if (getRegion().empty()) return emitError("expected non-empty body."); - // When it is container-like - it is expected to hold a loop-like operation. - if (isContainerLike()) { + if (getUnstructured()) { + if (!isContainerLike()) + return emitError( + "unstructured acc.loop must not have induction variables"); + } else if (isContainerLike()) { + // When it is container-like - it is expected to hold a loop-like operation. // Obtain the maximum collapse count - we use this to check that there // are enough loops contained. uint64_t collapseCount = getCollapseValue().value_or(1); diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 2946b53c8cb36..881e256a8797b 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -2565,6 +2565,39 @@ struct ConvertTrivialIfToSelect : public OpRewritePattern<IfOp> { struct ConditionPropagation : public OpRewritePattern<IfOp> { using OpRewritePattern<IfOp>::OpRewritePattern; + /// Kind of parent region in the ancestor cache. + enum class Parent { Then, Else, None }; + + /// Returns the kind of region ("then", "else", or "none") of the + /// IfOp that the given region is transitively nested in. Updates + /// the cache accordingly. 
+ static Parent getParentType(Region *toCheck, IfOp op, + DenseMap<Region *, Parent> &cache, + Region *endRegion) { + SmallVector<Region *> seen; + while (toCheck != endRegion) { + auto found = cache.find(toCheck); + if (found != cache.end()) + return found->second; + seen.push_back(toCheck); + if (&op.getThenRegion() == toCheck) { + for (Region *region : seen) + cache[region] = Parent::Then; + return Parent::Then; + } + if (&op.getElseRegion() == toCheck) { + for (Region *region : seen) + cache[region] = Parent::Else; + return Parent::Else; + } + toCheck = toCheck->getParentRegion(); + } + + for (Region *region : seen) + cache[region] = Parent::None; + return Parent::None; + } + LogicalResult matchAndRewrite(IfOp op, PatternRewriter &rewriter) const override { // Early exit if the condition is constant since replacing a constant @@ -2580,9 +2613,12 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { Value constantTrue = nullptr; Value constantFalse = nullptr; + DenseMap<Region *, Parent> cache; for (OpOperand &use : llvm::make_early_inc_range(op.getCondition().getUses())) { - if (op.getThenRegion().isAncestor(use.getOwner()->getParentRegion())) { + switch (getParentType(use.getOwner()->getParentRegion(), op, cache, + op.getCondition().getParentRegion())) { + case Parent::Then: { changed = true; if (!constantTrue) @@ -2591,8 +2627,9 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantTrue); }); - } else if (op.getElseRegion().isAncestor( - use.getOwner()->getParentRegion())) { + break; + } + case Parent::Else: { changed = true; if (!constantFalse) @@ -2601,6 +2638,10 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantFalse); }); + break; + } + case Parent::None: + break; } } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index ae3423c40040d..daef0ba02100a 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -717,7 +717,15 @@ Value mlir::vector::getVectorReductionOp(arith::AtomicRMWKind op, case arith::AtomicRMWKind::ori: return vector::ReductionOp::create(builder, vector.getLoc(), CombiningKind::OR, vector); - // TODO: Add remaining reduction operations. + case arith::AtomicRMWKind::minnumf: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::MINNUMF, vector); + case arith::AtomicRMWKind::maxnumf: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::MAXNUMF, vector); + case arith::AtomicRMWKind::xori: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::XOR, vector); default: (void)emitOptionalError(loc, "Reduction operation type not supported"); break; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 83406c8c75dcf..fb5d1e758dbd1 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -37,55 +37,61 @@ void XeGPUDialect::initialize() { >(); } -/// Generates instructions to compute offsets for a subgroup identified by -/// its multidimensional indices (sgId), using the specified subgroup layout -/// (sgLayout), subgroup data dimensions (sizePerSg), and the overall data -/// dimensions (sizePerWg). +// A `srcShape` consists of N distribution units, each being `subShapesLayout` x +// `subShape`. 
A `delinearizedId` is used to identify a particular `subShape` +// within each distribution unit. +// Example: +// WG data is 128x256. SG data is 16x32, in 4x2 layout, this gives a +// distribution unit of shape 64x64, we have 2x4 such distribution units. +// `delinearizedId` is used to identify a 16x32 of a subgroup in each +// distribution unit. static SmallVector<SmallVector<Value>> -genOffsetsComputingInsts(OpBuilder &builder, Location loc, - SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout, - ArrayRef<int64_t> sizePerSg, - ArrayRef<int64_t> sizePerWg) { - - SmallVector<SmallVector<Value>> offsets; +genCoordinates(OpBuilder &builder, Location loc, + SmallVector<Value> delinearizedId, + ArrayRef<int64_t> subShapesLayout, ArrayRef<int64_t> subShape, + ArrayRef<int64_t> srcShape) { + SmallVector<SmallVector<Value>> coordinates; + + // A distribution unit must be less than or equal to `srcShape` + SmallVector<int64_t> distUnitShape = llvm::map_to_vector( + llvm::zip_equal(srcShape, + computeElementwiseMul(subShapesLayout, subShape)), + [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); }); - // nd local offset, localOffset[i] = sgId[i] * sizePerSg[i] - SmallVector<Value> localOffsets = llvm::map_to_vector( - llvm::zip(sgId, sizePerSg), [&](const auto &t) -> Value { + // Get the offset of `subShape` within a distribution unit. + SmallVector<Value> distUnitLocalOffset = llvm::map_to_vector( + llvm::zip(delinearizedId, subShape), [&](const auto &t) -> Value { return builder.createOrFold<index::MulOp>( loc, std::get<0>(t), builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t))); }); - // distUnit[i] is the minimum value between sizePerWg[i] and - // sgLayout[i] * sizePerSg[i] - SmallVector<int64_t> distUnit = llvm::map_to_vector( - llvm::zip_equal(sizePerWg, computeElementwiseMul(sgLayout, sizePerSg)), - [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); }); - + // For each dist unit for (SmallVector<int64_t> unitOffs : - StaticTileOffsetRange(sizePerWg, distUnit)) { + StaticTileOffsetRange(srcShape, distUnitShape)) { + // Get dist unit offset within `srcShape`. SmallVector<Value> base = llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value { return arith::ConstantIndexOp::create(builder, loc, d); }); - - SmallVector<Value> adds = llvm::map_to_vector( - llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value { - return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t), - std::get<1>(t)); - }); - + // Calculate `subShape` offset within `srcShape`. + SmallVector<Value> adds = + llvm::map_to_vector(llvm::zip_equal(base, distUnitLocalOffset), + [&](const auto &t) -> Value { + return builder.createOrFold<arith::AddIOp>( + loc, std::get<0>(t), std::get<1>(t)); + }); + // Do not go beyond `srcShape` bounds. 
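// Editor's illustration (standalone, not part of the patch): the coordinate
// arithmetic above, worked with the numbers from the comment. WG data is
// 128x256, SG data is 16x32 in a 4x2 layout, so a distribution unit is 64x64
// and there are 2x4 such units. Subgroup id (1, 1) owns one 16x32 tile at
// local offset (16, 32) inside every distribution unit.
#include <algorithm>
#include <cstdio>

int main() {
  const long srcShape[2] = {128, 256}; // workgroup data
  const long subShape[2] = {16, 32};   // per-subgroup tile
  const long layout[2] = {4, 2};       // subgroup layout
  const long id[2] = {1, 1};           // delinearized subgroup id

  long distUnit[2], local[2];
  for (int i = 0; i < 2; ++i) {
    distUnit[i] = std::min(srcShape[i], layout[i] * subShape[i]); // 64, 64
    local[i] = id[i] * subShape[i];                               // 16, 32
  }
  // One coordinate per distribution unit, wrapped back into srcShape.
  for (long r = 0; r < srcShape[0]; r += distUnit[0])
    for (long c = 0; c < srcShape[1]; c += distUnit[1])
      std::printf("(%ld, %ld)\n", (r + local[0]) % srcShape[0],
                  (c + local[1]) % srcShape[1]);
  // Prints (16,32), (16,96), (16,160), (16,224), (80,32), (80,96), ...
  return 0;
}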
SmallVector<Value> mods = llvm::map_to_vector( - llvm::zip_equal(adds, sizePerWg), [&](const auto &t) -> Value { + llvm::zip_equal(adds, srcShape), [&](const auto &t) -> Value { return builder.createOrFold<index::RemUOp>( loc, std::get<0>(t), arith::ConstantIndexOp::create(builder, loc, std::get<1>(t))); }); - offsets.push_back(mods); + coordinates.push_back(mods); } - return offsets; + return coordinates; } // Checks if the given shape can be evenly distributed based on the layout @@ -272,56 +278,117 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, } FailureOr<SmallVector<Value>> -LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc, - Value linearId) { - // delinearizeSubgroupId is only available for - // workgroup-level layout attribute - if (!isForWorkgroup()) +LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { + + SmallVector<int64_t> sgLayoutInt; + if (isForWorkgroup()) { + sgLayoutInt = getEffectiveSgLayoutAsInt(); + } else if (isForSubgroup()) { + sgLayoutInt = getEffectiveLaneLayoutAsInt(); + } else { return failure(); + } - // TODO: handle order attribute - auto hasDefaultOrder = [&]() { - DenseI32ArrayAttr order = getOrder(); - return !order || isIdentityPermutation(llvm::to_vector_of<int64_t>( - llvm::reverse(order.asArrayRef()))); - }; - if (!hasDefaultOrder()) - return mlir::emitError(loc, "order attribute is currently not supported."); + DenseI32ArrayAttr orderAttr = getOrder(); - auto dims = - llvm::map_to_vector(getEffectiveSgLayoutAsInt(), [&](int64_t d) -> Value { - return builder.createOrFold<arith::ConstantIndexOp>(loc, d); - }); + // Handle order attribute + SmallVector<int64_t> order; + if (orderAttr && !orderAttr.empty()) { + order = llvm::to_vector( + llvm::map_range(orderAttr.asArrayRef(), + [](int32_t idx) { return static_cast<int64_t>(idx); })); + } else { + // Default order: [1, 0] for 2D (row-major), [2, 1, 0] for 3D, etc. + order = llvm::to_vector( + llvm::reverse(llvm::seq<int64_t>(0, sgLayoutInt.size()))); + } - return affine::delinearizeIndex(builder, loc, linearId, dims); + if (order.size() != sgLayoutInt.size()) { + return failure(); + } + + SmallVector<Value> result(sgLayoutInt.size()); + Value remaining = linearId; + + /// Process dimensions in the order they appear in the order array + /// The first dimension in order is the fastest-changing + /// + /// Example walkthrough for linearId=22, sgLayout=[2,4,4], order=[2,1,0]: + /// + /// Initial: remaining=22, dimIdx = order[i], dimSize = sgLayout[dimIdx], + /// result=[?,?,?] 
+ /// + /// i=0 (process columns, dimIdx=2, dimSize=4): + /// result[2] = 22 % 4 = 2 (column coordinate) + /// remaining = 22 / 4 = 5 (5 complete groups of 4 columns processed) + /// + /// i=1 (process rows, dimIdx=1, dimSize=4): + /// result[1] = 5 % 4 = 1 (row coordinate) + /// remaining = 5 / 4 = 1 (1 complete group of 4 rows processed) + /// + /// i=2 (process layers, dimIdx=0, dimSize=2): + /// result[0] = 1 % 2 = 1 (layer coordinate) + /// (no remaining update - last iteration) + /// + /// Final result: [1,1,2] = Layer 1, Row 1, Column 2 + for (size_t i = 0; i < order.size(); ++i) { + int64_t dimIdx = order[i]; + int64_t dimSize = sgLayoutInt[dimIdx]; + + Value dimSizeVal = + builder.createOrFold<arith::ConstantIndexOp>(loc, dimSize); + + /// Extract the coordinate for this dimension using modulo operation + /// This gives us "how far within this dimension" we are + /// e.g., linearId=22, dimSize=4: 22 % 4 = 2 (we're at position 2 within + /// this dimension) + result[dimIdx] = + builder.createOrFold<index::RemUOp>(loc, remaining, dimSizeVal); + + /// Update remaining for the next dimension by removing what we've already + /// processed. Division tells us "how many complete groups of this dimension + /// we've gone through" e.g., linearId=22, dimSize=4: 22 / 4 = 5 (we've + /// completed 5 groups of 4) Skip this for the last iteration since there's + /// no next dimension to process + if (i < order.size() - 1) { + remaining = + builder.createOrFold<index::DivUOp>(loc, remaining, dimSizeVal); + } + } + return result; } -/// Implements DistributeLayoutAttr::getOffsets to generate +/// Implements DistributeLayoutAttr::computeDistributedCoords to generate /// instructions for computing multi-dimensional offsets when distributed by /// LayoutAttr. FailureOr<SmallVector<SmallVector<Value>>> -LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, - ArrayRef<int64_t> shape) { - if (!isForWorkgroup()) +LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc, + Value linearId, ArrayRef<int64_t> shape) { + SmallVector<int64_t> layout; + SmallVector<int64_t> subShape; + if (isForWorkgroup()) { + layout = getEffectiveSgLayoutAsInt(); + subShape = getEffectiveSgDataAsInt(); + } else if (isForSubgroup()) { + layout = getEffectiveLaneLayoutAsInt(); + subShape = getEffectiveLaneDataAsInt(); + } else { return failure(); - - SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt(); - SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt(); - if (sgShape.empty()) { - if (auto derivedShape = computeShapeRatio(shape, sgLayout)) - sgShape = derivedShape.value(); + } + if (subShape.empty()) { + if (auto derivedShape = computeShapeRatio(shape, layout)) + subShape = derivedShape.value(); else return failure(); } // delinearize Ids - auto maybeIds = delinearizeSubgroupId(builder, loc, linearId); + auto maybeIds = delinearizeId(builder, loc, linearId); if (failed(maybeIds)) return failure(); - SmallVector<Value> sgIds = *maybeIds; + SmallVector<Value> ids = *maybeIds; - return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape, - shape); + return genCoordinates(builder, loc, ids, layout, subShape, shape); } //===----------------------------------------------------------------------===// @@ -375,34 +442,43 @@ SliceAttr SliceAttr::flatten() const { } FailureOr<SmallVector<Value>> -SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc, - Value linearId) { +SliceAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { SliceAttr attr = 
flatten(); auto parent = dyn_cast<LayoutAttr>(attr.getParent()); - return parent.delinearizeSubgroupId(builder, loc, linearId); + return parent.delinearizeId(builder, loc, linearId); } -/// Implements DistributeLayoutAttr::getOffsets to generate -/// instructions for computing multi-dimensional offsets when distributed by -/// SliceAttr. +// Implements DistributeLayoutAttr::computeDistributedCoords to generate +// instructions for computing multi-dimensional offsets when distributed by +// LayoutAttr. FailureOr<SmallVector<SmallVector<Value>>> -SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, - ArrayRef<int64_t> shape) { +SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc, + Value linearId, ArrayRef<int64_t> shape) { assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape."); if (!isForWorkgroup()) return failure(); - SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt(); - SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt(); - if (sgShape.empty()) { - if (auto derivedShape = computeShapeRatio(shape, sgLayout)) - sgShape = derivedShape.value(); + SmallVector<int64_t> layout; + SmallVector<int64_t> subShape; + if (isForWorkgroup()) { + layout = getEffectiveSgLayoutAsInt(); + subShape = getEffectiveSgDataAsInt(); + } else if (isForSubgroup()) { + layout = getEffectiveLaneLayoutAsInt(); + subShape = getEffectiveLaneDataAsInt(); + } else { + return failure(); + } + + if (subShape.empty()) { + if (auto derivedShape = computeShapeRatio(shape, layout)) + subShape = derivedShape.value(); else return failure(); } // delinearize Ids - auto maybeIds = delinearizeSubgroupId(builder, loc, linearId); + auto maybeIds = delinearizeId(builder, loc, linearId); if (failed(maybeIds)) return failure(); @@ -412,8 +488,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, SmallVector<Value> sgIds = XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims); - return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape, - shape); + return genCoordinates(builder, loc, sgIds, layout, subShape, shape); } bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index abd12e2e69ac0..4dd10bedc6d84 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -175,13 +175,13 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, LogicalResult IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, - UnitAttr subgroup_block_io, + UnitAttr subgroup_block_io, DistributeLayoutAttr layout, function_ref<InFlightDiagnostic()> emitError) { if (!dataTy) { if (subgroup_block_io) return emitError() << "subgroup_block_io " - "are only allowed when result is a 1D VectorType."; + "are only allowed when result is a VectorType."; else return success(); } @@ -192,15 +192,37 @@ IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, ArrayRef<int64_t> dataShape = dataTy.getShape(); ArrayRef<int64_t> mdescShape = mdescTy.getShape(); + SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); + ArrayAttr strideAttr = mdescTy.getStrideAttr(); + SmallVector<int64_t> strides; + for (Attribute attr : strideAttr.getValue()) { + strides.push_back(cast<IntegerAttr>(attr).getInt()); + } + if (subgroup_block_io && layout) { + auto laneData = layout.getEffectiveLaneDataAsInt(); + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (!laneData.empty()) { + bool isLaneDataContiguous = 
+ std::all_of(laneData.begin(), std::prev(laneData.end()), + [](int x) { return x == 1; }); + if (!isLaneDataContiguous) + return emitError() << "With subgroup_block_io, accessed data must be " + "contiguous and coalesced."; + for (size_t i = 0; i < laneData.size(); ++i) { + if (laneLayout[i] != blockShape[i]) + return emitError() << "With subgroup_block_io, the block shape must " + "match the lane layout."; + if (laneLayout[i] != 1 && strides[i] != 1) + return emitError() << "With subgroup_block_io, the distributed " + "dimensions must be contiguous."; + } + } + } if (dataShape.size() == 2) { - if (subgroup_block_io) - return emitError() << "subgroup_block_io " - "are only allowed when result is a 1D VectorType."; if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), [](auto p) { return std::get<0>(p) > std::get<1>(p); })) return emitError() << "data shape must not exceed mem_desc shape."; } else { - SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); // if the subgroup_block_io attribute is set, mdescTy must have block // attribute if (subgroup_block_io && !blockShape.size()) @@ -258,8 +280,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean. - if (staticShape == memrefShape && staticStrides == memrefStrides) { + // to keep the IR print clean (only do so for full-static case, otherwise + // printer would fail trying to print empty array-attr). + if (staticShape == memrefShape && staticStrides == memrefStrides && + dynamicShape.empty() && dynamicStrides.empty()) { staticShapeAttr = DenseI64ArrayAttr(); staticStridesAttr = DenseI64ArrayAttr(); } @@ -320,8 +344,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean. - if (staticShape == memrefShape && staticStrides == memrefStrides) { + // to keep the IR print clean (only do so for full-static case, otherwise + // printer would fail trying to print empty array-attr). + if (staticShape == memrefShape && staticStrides == memrefStrides && + dynamicShape.empty() && dynamicStrides.empty()) { staticShapeAttr = DenseI64ArrayAttr(); staticStridesAttr = DenseI64ArrayAttr(); } @@ -472,11 +498,8 @@ LogicalResult PrefetchNdOp::verify() { return emitOpError("invalid l3_hint: ") << getL3HintAttr(); int64_t tDescRank = tdescTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -597,11 +620,8 @@ LogicalResult LoadNdOp::verify() { << tdescTy; int64_t tDescRank = tdescTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? 
getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -691,11 +711,8 @@ LogicalResult StoreNdOp::verify() { << dstTy; int64_t tDescRank = dstTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -859,7 +876,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, valueType, source, Value(), mask, IntegerAttr(), - l1_hint, l2_hint, l3_hint); + l1_hint, l2_hint, l3_hint, /*layout=*/nullptr); } void LoadGatherOp::build(OpBuilder &builder, OperationState &state, @@ -875,7 +892,24 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, auto offset = vector::FromElementsOp::create(builder, loc, type, values); build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*layout=*/nullptr); +} + +void LoadGatherOp::build(OpBuilder &builder, OperationState &state, + Type valueType, Value source, + ArrayRef<OpFoldResult> offsets, Value mask, + IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint, + xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint, + xegpu::LayoutAttr layout) { + auto loc = source.getLoc(); + int64_t size = static_cast<int64_t>(offsets.size()); + auto type = VectorType::get(size, builder.getIndexType()); + auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); + auto offset = vector::FromElementsOp::create(builder, loc, type, values); + + build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, + l2_hint, l3_hint, layout); } //===----------------------------------------------------------------------===// @@ -926,7 +960,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*layout=*/nullptr); } void StoreScatterOp::build(OpBuilder &builder, OperationState &state, @@ -944,7 +978,23 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, // Call the correct builder overload that does not expect result types. 
build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, - l3_hint); + l3_hint, /*layout=*/nullptr); +} + +void StoreScatterOp::build( + OpBuilder &builder, OperationState &state, Value value, Value dest, + ArrayRef<OpFoldResult> offsets, Value mask, IntegerAttr chunk_size, + xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint, xegpu::LayoutAttr layout) { + auto loc = dest.getLoc(); + int64_t size = static_cast<int64_t>(offsets.size()); + auto type = VectorType::get(size, builder.getIndexType()); + auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); + auto offset = vector::FromElementsOp::create(builder, loc, type, values); + + // Call the correct builder overload that does not expect result types. + build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, + l3_hint, layout); } //===----------------------------------------------------------------------===// @@ -1105,7 +1155,7 @@ LogicalResult LoadMatrixOp::verify() { MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io, - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } //===----------------------------------------------------------------------===// @@ -1129,7 +1179,7 @@ LogicalResult StoreMatrixOp::verify() { UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io, - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } namespace mlir { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt index e6f76067094ce..29b645feab2c6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms XeGPUWgToSgDistribute.cpp XeGPUPropagateLayout.cpp XeGPUVectorLinearize.cpp + XeGPUOptimizeBlockLoads.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp new file mode 100644 index 0000000000000..4dc5ea4f7bb24 --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -0,0 +1,490 @@ +//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" +#include "mlir/Dialect/XeGPU/Transforms/Transforms.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" +#include "mlir/Dialect/XeGPU/uArch/uArchBase.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include <optional> + +namespace mlir { +namespace xegpu { +#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS +#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" +} // namespace xegpu +} // namespace mlir + +#define DEBUG_TYPE "xegpu-optimize-block-loads" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") + +using namespace mlir; + +namespace { + +/// Get the 2D lane data from a tensor desc type if it exists. +static std::optional<SmallVector<int64_t>> +getMaybeLaneData(xegpu::TensorDescType tdescType) { + auto layout = tdescType.getLayoutAttr(); + if (!layout) + return std::nullopt; + auto laneData = layout.getEffectiveLaneDataAsInt(); + if (laneData.size() != 2) + return std::nullopt; + return laneData; +} + +/// Get the 2D lane layout from a tensor desc type if it exists. +static std::optional<SmallVector<int64_t>> +getMaybeLaneLayout(xegpu::TensorDescType tdescType) { + auto layout = tdescType.getLayoutAttr(); + if (!layout) + return std::nullopt; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (laneLayout.size() != 2) + return std::nullopt; + return laneLayout; +} + +/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 && +/// lane[1] == 1), but inner lane data is not equal to [1, 1]. +/// Example: +/// !xegpu.tensor_desc<16x16xf16, +/// #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>> +/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form) +/// indicating that this is a load that requires transpose effect. However, +/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from +/// the inner dimension. We convert this to a optimized form by converting the +/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the +/// later lowering easily use the load with transpose instruction. +static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout, + ArrayRef<int64_t> laneData) { + if (laneLayout.size() != 2 || laneData.size() != 2) + return false; + if (laneLayout[0] == 1 || laneLayout[1] != 1) + return false; + if (laneData[0] != 1 || laneData[1] == 1) + return false; + return true; +} + +/// A tensor desc type can be optimized if its element type is less than 32 bits +/// and its layout can be optimized. +static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) { + // If the dtype is greater or equal to 32 bits, layout must be valid. 
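// Editor's sketch (standalone, not from the patch): the layout predicate
// defined just above in plain C++, exercised with the example from the doc
// comment. lane_layout = [16, 1] with lane_data = [1, 2] qualifies; the
// usual [1, 16] / [1, 1] layout of a non-transposed load does not.
#include <cassert>
#include <vector>

static bool canOptimizeForTranspose(const std::vector<long> &laneLayout,
                                    const std::vector<long> &laneData) {
  if (laneLayout.size() != 2 || laneData.size() != 2)
    return false;
  if (laneLayout[0] == 1 || laneLayout[1] != 1) // layout must be transposed
    return false;
  if (laneData[0] != 1 || laneData[1] == 1) // inner lane data must be > 1
    return false;
  return true;
}

int main() {
  assert(canOptimizeForTranspose({16, 1}, {1, 2}));  // f16 transpose load
  assert(!canOptimizeForTranspose({1, 16}, {1, 1})); // plain row-major load
  assert(!canOptimizeForTranspose({16, 1}, {1, 1})); // already [1, 1] data
  return 0;
}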
+ int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + if (elementTyBitwidth >= 32) + return false; + auto maybeLaneLayout = getMaybeLaneLayout(tdescType); + auto maybeLaneData = getMaybeLaneData(tdescType); + if (!maybeLaneData || !maybeLaneLayout) + return false; + return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData); +} + +/// Check if a tensor desc type can be optimized for transpose, if so return the +/// new optimized tensor desc type with a valid transpose layout. +static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, + const uArch *targetuArch) { + if (!canBeOptimizedForTranspose(tdescType)) + return tdescType; + auto laneData = getMaybeLaneData(tdescType) + .value(); // Lane data must exist if we reach here. + int64_t innerLaneData = laneData[1]; + int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + // Required shape is total shape of the vector result that this tensor desc + // must eventually load after adjusting for the new bitwidth and array + // length. + SmallVector<int64_t> requiredShape(tdescType.getShape()); + requiredShape.back() = + requiredShape.back() * tdescType.getArrayLength() / innerLaneData; + int newBitWidth = elementTyBitwidth * innerLaneData; + Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth); + // Supported shape is the max transpose shape that can be supported by + // hardware that is less than or equal to required shape. + auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>( + targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad)); + auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount( + newElemTy, /** has transform */ false, /** has transpose */ true); + // If no HW params found, return the original type. + if (!maybeHWParams) + return tdescType; + auto [widths, heights, counts] = maybeHWParams.value(); + // TODO: Currently we expect array length to be 1 for transpose case. + if (counts.size() != 1 || counts[0] != 1) + return tdescType; + int arrayLen = counts[0]; + int supportedHeight = + xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights); + int supportedWidth = + xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths); + // If no supported height or width found, return the original type. + if (supportedHeight == -1 || supportedWidth == -1) + return tdescType; + + SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth}; + xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( + tdescType.getContext(), + tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); + // Array length can not be larger than 1 for transpose case. + return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen, + tdescType.getBoundaryCheck(), + tdescType.getMemorySpace(), newLayout); +} + +/// Helper to convert an OpFoldResult to Value. +static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, + OpFoldResult ofr) { + std::optional<int64_t> mayBeInt = getConstantIntValue(ofr); + if (mayBeInt) + return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult(); + return llvm::cast<Value>(ofr); +} + +/// Helper to divide a Value by a constant integer. +static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, + Value val, int64_t constant) { + // If the constant is a power of 2, use right shift for division. 
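// Editor's note (standalone sketch, not part of the patch): for unsigned
// values, dividing by a power of two is equivalent to a logical right shift
// by log2 of the divisor, which is the fast path divideByConstant takes here.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  const uint64_t innerLaneData = 2; // e.g. two f16 elements packed per i32
  const uint64_t shift = 1;         // log2(2)
  for (uint64_t offset : {0ull, 6ull, 64ull, 255ull})
    assert(offset / innerLaneData == offset >> shift);
  return 0;
}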
+ if (llvm::isPowerOf2_64(constant)) { + int64_t shiftAmount = llvm::Log2_64(constant); + return arith::ShRUIOp::create( + rewriter, loc, val, + arith::ConstantIndexOp::create(rewriter, loc, shiftAmount) + .getResult()) + .getResult(); + } + auto constantOp = + arith::ConstantIndexOp::create(rewriter, loc, constant).getResult(); + return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult(); +} + +/// This function takes a larger register block `data` and generates multiple +/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block +/// starting from `offsets`. +static Value generateLoads(ConversionPatternRewriter &rewriter, + TypedValue<VectorType> data, + SmallVector<OpFoldResult> offsets, + TypedValue<xegpu::TensorDescType> newTensorDesc, + xegpu::LoadNdOp origLoadOp) { + Location loc = data.getLoc(); + assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp"); + Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); + Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); + SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape()); + // Compute the ratio between original shape and supported shape. We need to + // generate loads in this ratio arrangement. + auto shapeRatio = computeShapeRatio(data.getType().getShape(), + supportedShape) + .value(); // `ratio` must be defined if we reach here. + for (int64_t h = 0; h < shapeRatio[0]; ++h) { + for (int64_t w = 0; w < shapeRatio[1]; ++w) { + int64_t localOffsetDim0 = h * supportedShape[0]; + int64_t localOffsetDim1 = w * supportedShape[1]; + Value loadOffsetX = arith::AddIOp::create( + rewriter, loc, offsetDim0, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0) + .getResult()); + Value loadOffsetY = arith::AddIOp::create( + rewriter, loc, offsetDim1, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1) + .getResult()); + auto loadOp = xegpu::LoadNdOp::create( + rewriter, loc, + VectorType::get(supportedShape, data.getType().getElementType()), + newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY}, + origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(), + origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(), + origLoadOp.getL3HintAttr()); + // Set the layout for the loadOp. + auto layoutAttr = newTensorDesc.getType().getLayoutAttr(); + xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr); + // Insert the loaded block into the right position in data. + auto insertOp = vector::InsertStridedSliceOp::create( + rewriter, loc, loadOp.getResult(), data, + ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1}, + ArrayRef<int64_t>{1, 1}); + // InsertOp must have the same layout as newTensorDesc. + xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); + data = insertOp.getResult(); + } + } + return data; +} + +/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a +/// new CreateNdDescOp with optimized tensor desc type. This involves extracting +/// the base pointer from the original memory source and adjusting the shape and +/// strides of the tensor desc to fit with the new optimized transpose layout. 
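// Editor's illustration (standalone, with hypothetical sizes): how the
// generateLoads helper above tiles one large register block into several
// hardware-sized loads. With a 32x16 block and an 8x16 supported shape the
// ratio is 4x1, so four loads are emitted at row offsets 0, 8, 16 and 24
// relative to the original load offsets.
#include <cstdio>

int main() {
  const long dataShape[2] = {32, 16}; // block the original load produced
  const long supported[2] = {8, 16};  // largest HW-supported tile
  const long base[2] = {64, 0};       // offsets of the original load
  const long ratio[2] = {dataShape[0] / supported[0],
                         dataShape[1] / supported[1]}; // {4, 1}
  for (long h = 0; h < ratio[0]; ++h)
    for (long w = 0; w < ratio[1]; ++w)
      std::printf("load at (%ld, %ld), insert at (%ld, %ld)\n",
                  base[0] + h * supported[0], base[1] + w * supported[1],
                  h * supported[0], w * supported[1]);
  return 0;
}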
+class XeGPUCreateNdDescOpPattern final + : public OpConversionPattern<xegpu::CreateNdDescOp> { +public: + using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto tdescTy = createNdOp.getType(); + // Get the target uArch info. + auto chipStr = xegpu::getChipStr(createNdOp); + // Check if the chip is supported. + assert( + chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") && + "Expecting target chip to be pvc or bmg for transpose optimization."); + const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value()); + + auto convertType = tryOptimize(tdescTy, targetuArch); + if (convertType == tdescTy) + return failure(); + auto strides = createNdOp.getMixedStrides(); + auto maybeConstInnerStride = getConstantIntValue(strides.back()); + // Only row-major memrefs are expected for now. + if (!maybeConstInnerStride || *maybeConstInnerStride != 1) + return rewriter.notifyMatchFailure( + createNdOp, "Expecting row-major memref for transpose optimization."); + Value source = createNdOp.getSource(); + auto optionalLaneData = getMaybeLaneData(tdescTy); + assert(optionalLaneData && "Expected 2D lane data"); + auto laneData = optionalLaneData.value(); + int64_t innerLaneData = laneData[1]; + auto memrefType = dyn_cast<MemRefType>(source.getType()); + // Inner dimension of the shape must be adjusted based on innerLaneData. + SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes()); + modifiedShape.back() = divideByConstant( + rewriter, createNdOp.getLoc(), + convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()), + innerLaneData); + // Similarly, second to last stride must be adjusted. + assert(strides.size() >= 2 && + "Expected at least 2 strides for CreateNdDescOp"); + SmallVector<OpFoldResult> modifiedStrides(strides); + modifiedStrides[modifiedStrides.size() - 2] = divideByConstant( + rewriter, createNdOp.getLoc(), + convertToValue(rewriter, createNdOp.getLoc(), + modifiedStrides[modifiedStrides.size() - 2]), + innerLaneData); + + // If the source is a static memref, we need to extract the pointer to + // base address. + if (memrefType && memrefType.hasStaticShape()) { + auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, createNdOp.getLoc(), source); + source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(), + rewriter.getI64Type(), + extractOp.getResult()) + .getResult(); + } + // Create a new CreateNdDescOp with the modified shape and converted type. + auto newCreateNdDescOp = xegpu::CreateNdDescOp::create( + rewriter, createNdOp.getLoc(), convertType, source, modifiedShape, + modifiedStrides); + rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult()); + return success(); + } +}; + +/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for +/// tranpose optimization. If so, rewrites the LoadNdOp to to align with the +/// adjusted tensor desc type. This can result in multiple LoadNdOps being +/// generated to fill in the original load shape. 
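// Editor's sketch (standalone, hypothetical 64x128 f16 source): the shape,
// stride and element-width adjustments the pattern above performs when the
// tensor descriptor is retyped so that lane_data becomes [1, 1].
#include <cassert>

int main() {
  const long innerLaneData = 2; // lane_data = [1, 2] for f16
  const long elemBits = 16;     // f16
  long shape[2] = {64, 128};    // original row-major shape
  long strides[2] = {128, 1};   // original strides

  const long newElemBits = elemBits * innerLaneData; // 32 -> i32 elements
  shape[1] /= innerLaneData;   // 128 f16 -> 64 i32 per row
  strides[0] /= innerLaneData; // row stride counted in i32 units
  assert(newElemBits == 32 && shape[1] == 64 && strides[0] == 64);
  // The innermost stride stays 1: the memref is still contiguous row-major.
  assert(strides[1] == 1);
  return 0;
}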
+class XeGPULoadNdDescOpPattern final + : public OpConversionPattern<xegpu::LoadNdOp> { +public: + using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto origTensorDescType = loadNdOp.getTensorDescType(); + auto adaptorType = + cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType()); + if (adaptorType == origTensorDescType) + return failure(); + // Offsets must be adjusted based on innerLaneData. + auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value(); + int64_t innerLaneData = laneData[1]; + auto offsets = loadNdOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(loadNdOp, + "Expecting offsets in LoadNd"); + SmallVector<OpFoldResult> modifiedOffsets(offsets); + modifiedOffsets.back() = divideByConstant( + rewriter, loadNdOp.getLoc(), + convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()), + innerLaneData); + // Get the 2D data shape of this loadNdOp in its original type including + // array length. + SmallVector<int64_t> origDataShape(origTensorDescType.getShape()); + // Adjust the data shape based on innerLaneData. + origDataShape.back() /= innerLaneData; + // HW supported shape is the new tensor desc shape after conversion. + SmallVector<int64_t> hwSupportedShape(adaptorType.getShape()); + VectorType origVectorType = + VectorType::get(origDataShape, adaptorType.getElementType()); + Value data; + // Orig data shape is 3D for the array length case. + if (origTensorDescType.getArrayLength() > 1) { + SmallVector<Value> arraySlices; + for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) { + Value slice = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), origVectorType, + rewriter.getZeroAttr(origVectorType)); + // Increase the Y offset for each array slice. + Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(), + modifiedOffsets.back()); + modifiedOffsets.back() = + arith::AddIOp::create( + rewriter, loadNdOp->getLoc(), offsetY, + arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(), + i * origDataShape[1]) + .getResult()) + .getResult(); + slice = generateLoads( + rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets, + cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()), + loadNdOp); + // BitCast back to original load shape without array length. + auto bitcastType = VectorType::get(origTensorDescType.getShape(), + origTensorDescType.getElementType()); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + bitcastType, slice); + // BitCastOp must have the same layout as the original loadNdOp. + xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); + arraySlices.push_back(bitCastOp.getResult()); + } + rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); + return success(); + } + data = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), + VectorType::get(origDataShape, adaptorType.getElementType()), + rewriter.getZeroAttr(origVectorType)); + data = generateLoads( + rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets, + cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()), + loadNdOp); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + loadNdOp.getType(), data); + // BitCastOp must have the same layout as the original loadNdOp. 
+ xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); + rewriter.replaceOp(loadNdOp, bitCastOp); + return success(); + } +}; + +/// Vector ExtractOp must be processed if the original tensor desc type has +/// array length greater than 1. In this case, the LoadNdOp is replaced with +/// multiple LoadNdOps for each array slice making the extraction unnecessary. +/// In this case, we simply remove the ExtractOp. +class VectorExtractOpPattern final + : public OpConversionPattern<vector::ExtractOp> { +public: + using OpConversionPattern<vector::ExtractOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Check if the source of the extraction is split to multiple values. + if (adaptor.getSource().size() == 1) + return failure(); + auto mixedPos = extractOp.getMixedPosition(); + if (mixedPos.size() != 1) + return failure(); + auto mayBeInt = getConstantIntValue(mixedPos[0]); + if (!mayBeInt) + return failure(); + rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]); + return success(); + } +}; + +} // namespace + +void xegpu::populateXeGPUOptimizeBlockLoadsPatterns( + RewritePatternSet &patterns) { + patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern, + VectorExtractOpPattern>(patterns.getContext()); +} + +namespace { + +struct XeGPUOptimizeBlockLoadsPass final + : public xegpu::impl::XeGPUOptimizeBlockLoadsBase< + XeGPUOptimizeBlockLoadsPass> { + void runOnOperation() override { + MLIRContext &context = getContext(); + TypeConverter converter; + RewritePatternSet patterns(&context); + ConversionTarget target(context); + + // This pass is only meant for PVC and BMG targets. If unsupported target + // is found, exit early. + bool isTargetSupported = false; + getOperation()->walk([&](gpu::GPUFuncOp funcOp) { + auto chipStr = xegpu::getChipStr(funcOp); + if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg")) + isTargetSupported = true; + }); + + if (!isTargetSupported) { + DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets." + << "\n"; + return; + } + + // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be + // converted. + target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>( + [&](xegpu::CreateNdDescOp createNdOp) { + return !canBeOptimizedForTranspose(createNdOp.getType()); + }); + target.addDynamicallyLegalOp<xegpu::LoadNdOp>( + [&](xegpu::LoadNdOp loadNdOp) { + return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType()); + }); + // Vector ExtractOps can have optimizable layouts if they extract from + // LoadNdOps with array length greater than 1. These ExtractOps must be + // converted. 
+ target.addDynamicallyLegalOp<vector::ExtractOp>( + [&](vector::ExtractOp extractOp) { + auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult()); + if (!layout) + return true; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + auto laneData = layout.getEffectiveLaneDataAsInt(); + return !canBeOptimizedForTranspose(laneLayout, laneData); + }); + converter.addConversion([](Type type) { return type; }); + + target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect, + vector::VectorDialect>(); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) { + DBGS() << "Optimize block loads pass failed.\n"; + return signalPassFailure(); + } + } +}; + +} // namespace diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 90eae871a5ef3..4e1a539771d2f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -204,28 +204,6 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> { using Lattice::Lattice; }; -/// Helper Function to find a proper instruction multiple for the user-supplied -/// sg-level data shape. `candidates` are uArch allowed shapes. -/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count). -template <typename T> -int getLargestDivisor(T dim, ArrayRef<T> candidates, - ArrayRef<T> candidateMultiples = {}) { - static_assert(std::is_integral<T>::value, "T must be an integer type"); - int largest = -1; - SmallVector<T> multiples = {1}; - if (!candidateMultiples.empty()) - multiples = - SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); - for (T candidate : candidates) { - for (T multiple : multiples) { - int value = static_cast<int>(candidate * multiple); - if (value != 0 && dim % value == 0 && value > largest) - largest = value; - } - } - return largest; -} - /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation /// (like DPAS). 
@@ -505,7 +483,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( prefetch.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector<int> instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -514,7 +492,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( if (tdescTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); if (instHeight == -1) prefetch.emitWarning( @@ -634,7 +612,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataALen = aTy.getShape().front(); auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); const int maxALen = - getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); + xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); if (maxALen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -642,7 +620,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataBLen = bTy.getShape().back(); auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); const int maxBLen = - getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); + xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); if (maxBLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -662,7 +640,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataCLen = bTy.getShape().back(); auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); const int maxCLen = - getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); + xegpu::getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); if (maxCLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -691,7 +669,7 @@ void LayoutInfoPropagation::visitStoreNdOp( store.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector<int> instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -700,7 +678,7 @@ void LayoutInfoPropagation::visitStoreNdOp( if (dataTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); if (instHeight == -1) store.emitWarning( @@ -904,9 +882,16 @@ void LayoutInfoPropagation::visitStoreScatterOp( if (dstTdescTy.getChunkSizeAsInt() > 1) instData.push_back(chunkSize); } - LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo( - payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), - /*scattered=*/true); + + LayoutInfo payloadLayout; + + if (auto layout = storeScatter.getLayoutAttr()) { + payloadLayout = LayoutInfo(layout); + } else { + payloadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered=*/true); + } LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); @@ -1041,7 +1026,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a 
vector type, add a temporary layout attribute to the
   // op.
-      xegpu::setDistributeLayoutAttr(result, layout);
+      xegpu::setDistributeLayoutAttr(result, layout, /*respectPermLayout=*/true);
   }
   return success();
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 5a3b27ec6108e..bbd7733e89c29 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
@@ -912,6 +913,186 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
+static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
+    PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
+    Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
+  SmallVector<Value> newCoords;
+  auto maybeCoords =
+      layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
+  if (failed(maybeCoords))
+    return {};
+  assert(maybeCoords.value().size() == 1 &&
+         "Expected one set of distributed offsets");
+  SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned(
+      rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]),
+      getAsOpFoldResult(origOffsets));
+  newCoords = llvm::to_vector(llvm::map_range(
+      ofrVec, [&](OpFoldResult ofr) -> Value { return cast<Value>(ofr); }));
+  return newCoords;
+}
+
+/// Pattern for distributing xegpu::LoadMatrixOp.
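+/// The load is sunk out of the enclosing WarpExecuteOnLane0Op: its memory
+/// descriptor and offsets are yielded through the warp op, the payload type
+/// is narrowed to the per-lane shape implied by the layout, and (unless the
+/// subgroup_block_io attribute is present) the offsets are rebased to
+/// lane-local coordinates via computeDistributedCoordinatesForMatrixOp.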
+struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + gpu::YieldOp yield = warpOp.getTerminator(); + Operation *lastNode = yield->getPrevNode(); + auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode); + if (!matrixOp) + return failure(); + + OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) { + return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op; + }); + if (!producedByLastLoad) + return rewriter.notifyMatchFailure( + warpOp, "The last op is not xegpu::LoadMatrixOp"); + const int operandIdx = producedByLastLoad->getOperandNumber(); + + VectorType sgPayloadTy = + dyn_cast<VectorType>(matrixOp.getResult().getType()); + VectorType warpResultTy = + cast<VectorType>(warpOp.getResult(operandIdx).getType()); + if (!sgPayloadTy) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix op payload must be a vector type"); + + auto loc = matrixOp.getLoc(); + auto offsets = matrixOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(matrixOp, + "the load op must have offsets"); + SmallVector<Value> offsetsAsValues = + vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); + + auto layout = matrixOp.getLayoutAttr(); + if (!layout) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix operation lacks layout attribute"); + + FailureOr<VectorType> distPayloadByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy); + if (failed(distPayloadByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + matrixOp, "Failed to distribute matrix op payload based on layout."); + + SmallVector<Value> operands = {matrixOp.getMemDesc()}; + const unsigned offsetsStartIdx = operands.size(); + operands.append(offsetsAsValues); + + SmallVector<Type> operandTypes = llvm::to_vector( + llvm::map_range(operands, [](Value v) { return v.getType(); })); + + SmallVector<size_t> newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, operands, operandTypes, newRetIndices); + SmallVector<Value> newOperands = llvm::map_to_vector( + newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); + + SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()}; + std::fill(newConstOffsets.begin(), newConstOffsets.end(), + ShapedType::kDynamic); + DenseI64ArrayAttr newConstOffsetsAttr = + rewriter.getDenseI64ArrayAttr(newConstOffsets); + ValueRange currentOffsets = + ValueRange(newOperands).drop_front(offsetsStartIdx); + + SmallVector<Value> newCoords = currentOffsets; + rewriter.setInsertionPointAfter(newWarpOp); + + if (!matrixOp.getSubgroupBlockIoAttr()) { + newCoords = computeDistributedCoordinatesForMatrixOp( + rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(), + currentOffsets); + } + xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create( + rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure, + newOperands[0], ValueRange(newCoords), newConstOffsetsAttr, + matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{}); + // Resolve the output type and replace all uses. + rewriter.replaceAllUsesWith( + newWarpOp.getResult(operandIdx), + resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter)); + return success(); + } +}; + +/// Pattern for distributing xegpu::StoreMatrixOp. 
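+/// Mirrors LoadMatrixDistribution: the stored payload, memory descriptor and
+/// offsets are yielded out of the warp op, the payload is rewritten to its
+/// per-lane distributed type, offsets are adjusted per lane unless the
+/// subgroup_block_io attribute is present, and the original store inside the
+/// warp region is erased.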
+struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + gpu::YieldOp yield = warpOp.getTerminator(); + Operation *lastNode = yield->getPrevNode(); + auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode); + if (!matrixOp) + return failure(); + + VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType()); + if (!sgPayloadTy) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix op payload must be a vector type"); + + auto loc = matrixOp.getLoc(); + auto offsets = matrixOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(matrixOp, + "the store op must have offsets"); + SmallVector<Value> offsetsAsValues = + vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); + + auto layout = matrixOp.getLayoutAttr(); + if (!layout) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix operation lacks layout attribute"); + + FailureOr<VectorType> distPayloadByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy); + if (failed(distPayloadByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + matrixOp, "Failed to distribute matrix op payload based on layout."); + + SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()}; + const unsigned offsetsStartIdx = operands.size(); + operands.append(offsetsAsValues); + + SmallVector<Type> operandTypes = llvm::to_vector( + llvm::map_range(operands, [](Value v) { return v.getType(); })); + operandTypes[0] = *distPayloadByWarpOpOrFailure; + + SmallVector<size_t> newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, operands, operandTypes, newRetIndices); + SmallVector<Value> newOperands = llvm::map_to_vector( + newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); + + SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()}; + std::fill(newConstOffsets.begin(), newConstOffsets.end(), + ShapedType::kDynamic); + DenseI64ArrayAttr newConstOffsetsAttr = + rewriter.getDenseI64ArrayAttr(newConstOffsets); + ValueRange currentOffsets = + ValueRange(newOperands).drop_front(offsetsStartIdx); + + SmallVector<Value> newCoords = currentOffsets; + rewriter.setInsertionPointAfter(newWarpOp); + + if (!matrixOp.getSubgroupBlockIoAttr()) { + newCoords = computeDistributedCoordinatesForMatrixOp( + rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(), + currentOffsets); + } + + xegpu::StoreMatrixOp::create( + rewriter, loc, TypeRange{}, newOperands[0], newOperands[1], + ValueRange(newCoords), newConstOffsetsAttr, + matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{}); + rewriter.eraseOp(matrixOp); + return success(); + } +}; + /// Distribute a scattered load op. The logic and requirements are the same as /// for the scattered store distribution. The warpOp's payload vector is /// expected to be distributed by the load's result consumer. 
@@ -1443,7 +1624,8 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( LoadNdDistribution, DpasDistribution, PrefetchNdDistribution, GpuBarrierDistribution, VectorMultiReductionDistribution, LoadDistribution, StoreDistribution, VectorTransposeDistribution, - VectorBitcastDistribution, + VectorBitcastDistribution, LoadMatrixDistribution, + StoreMatrixDistribution, MemrefExtractAlignedPointerAsIndexDistribution>( patterns.getContext(), /*pattern benefit=*/regularPatternBenefit); @@ -1468,6 +1650,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // Layouts are needed for vector type only. if (!isa<VectorType>(operand.get().getType())) continue; + if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op)) + continue; auto layout = xegpu::getDistributeLayoutAttr(operand.get()); if (!layout) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index e6e71cc29a80a..c3bf9606693a8 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,12 +678,16 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } + auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr()); + if (layout) + layout = layout.dropInstData(); + SmallVector<Value> newOps; for (auto [o, m] : llvm::zip(convertedOffsets, convertedMasks)) { auto newOp = xegpu::LoadGatherOp::create( rewriter, loc, newValueTy, op.getSource(), o, m, rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); + op.getL2HintAttr(), op.getL3HintAttr(), layout); newOps.push_back(newOp); } @@ -774,12 +778,16 @@ struct UnrollStoreScatterOpWithOffsets SmallVector<Value> convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); + auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr()); + if (layout) + layout = layout.dropInstData(); + for (auto [v, o, m] : llvm::zip(convertedValues, convertedOffsets, convertedMasks)) { xegpu::StoreScatterOp::create(rewriter, loc, v, op.getDest(), o, m, rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), op.getL2HintAttr(), - op.getL3HintAttr()); + op.getL3HintAttr(), layout); } rewriter.eraseOp(op); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 9fc5ad9af5c7b..0a9ef0aa6df96 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -114,7 +114,8 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, // Compute the list of subgroup-relative offsets for sub-tensors or sub-memory // descriptors to be accessed, based on the layout information. 
ArrayRef<int64_t> wgShape = op.getDataShape(); - auto maybeDescOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto maybeDescOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(maybeDescOffsets)) return failure(); @@ -830,8 +831,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> { // Get subgroup id Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - - auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto sgOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(sgOffsets)) return failure(); @@ -888,8 +889,8 @@ struct WgToSgLoadGatherOpWithOffset return failure(); ArrayRef<int64_t> wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>( + xegpu::getDistributeLayoutAttr(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -914,9 +915,8 @@ struct WgToSgLoadGatherOpWithOffset llvm::zip(adaptor.getOffsets(), adaptor.getMask())) { auto newLoadOp = xegpu::LoadGatherOp::create( rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); - xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), - layout.dropSgLayoutAndData()); + op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -941,8 +941,8 @@ struct WgToSgStoreScatterOpWithOffset if (!valueType) return failure(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getOperand(0)); + xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>( + xegpu::getDistributeLayoutAttr(op.getOperand(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -964,7 +964,8 @@ struct WgToSgStoreScatterOpWithOffset adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) { auto store = xegpu::StoreScatterOp::create( rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); + op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); // Update the layout attribute to drop sg_layout and sg_data. if (!layout.getEffectiveLaneLayoutAsInt().empty() || !layout.getEffectiveInstDataAsInt().empty()) { @@ -1052,7 +1053,8 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> { Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto sgOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(sgOffsets)) return failure(); @@ -1217,6 +1219,70 @@ struct WgToSgMultiDimReductionOp } }; +// This pattern transforms vector.transpose ops to work at subgroup level. 
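+// It requires both the source and the result to carry workgroup layouts whose
+// sg_layout, sg_data and order are related by the transpose permutation; each
+// subgroup-local slice is then transposed individually and tagged with the
+// result layout minus its sg_layout/sg_data fields.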
+struct WgToSgVectorTransposeOp + : public OpConversionPattern<vector::TransposeOp> { + using OpConversionPattern<vector::TransposeOp>::OpConversionPattern; + + LogicalResult + matchAndRewrite(vector::TransposeOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + VectorType resultType = op.getResultVectorType(); + + ArrayRef<int64_t> wgShape = resultType.getShape(); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); + if (!layout || !layout.isForWorkgroup()) + return failure(); + + xegpu::DistributeLayoutAttr sourceLayout = + xegpu::getDistributeLayoutAttr(op.getVector()); + if (!sourceLayout || !sourceLayout.isForWorkgroup()) + return failure(); + + SmallVector<int64_t> sourceSgLayout = + sourceLayout.getEffectiveSgLayoutAsInt(); + SmallVector<int64_t> resultSgLayout = layout.getEffectiveSgLayoutAsInt(); + DenseI32ArrayAttr sourceOrder = sourceLayout.getOrder(); + DenseI32ArrayAttr resultOrder = layout.getOrder(); + + if (!sourceOrder || !resultOrder) { + return rewriter.notifyMatchFailure( + op, "Both source and result must have order attributes"); + } + + ArrayRef<int64_t> permutation = op.getPermutation(); + size_t permutationSize = permutation.size(); + if (sourceSgLayout.size() != permutationSize || + resultSgLayout.size() != permutationSize) { + return rewriter.notifyMatchFailure( + op, "Layouts and permutation must have the same rank"); + } + + // Check that sgLayout, sgData & order are properly transposed for source + // and result + if (!layout.isTransposeOf(sourceLayout, permutation)) + return rewriter.notifyMatchFailure( + op, "Result layout is not a valid transpose of source layout " + "according to permutation"); + + SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first; + VectorType newResultType = + VectorType::get(sgShape, resultType.getElementType()); + SmallVector<Value> newTransposeOps; + for (auto src : adaptor.getVector()) { + auto newTranspose = vector::TransposeOp::create( + rewriter, op.getLoc(), newResultType, src, permutation); + xegpu::setDistributeLayoutAttr(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); + newTransposeOps.push_back(newTranspose.getResult()); + } + + rewriter.replaceOpWithMultiple(op, {newTransposeOps}); + return success(); + } +}; + } // namespace namespace mlir { @@ -1231,7 +1297,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset, WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp, WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp, - WgToSgMultiDimReductionOp>(patterns.getContext()); + WgToSgMultiDimReductionOp, WgToSgVectorTransposeOp>( + patterns.getContext()); } } // namespace xegpu } // namespace mlir @@ -1358,7 +1425,9 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(layout); }); - target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp>( + target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp, + vector::TransposeOp, vector::BroadcastOp, + vector::MultiDimReductionOp>( [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. 
auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
@@ -1377,16 +1446,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
         return isLegal(layout);
       });
 
-  target.addDynamicallyLegalOp<vector::BroadcastOp>(
-      [=](vector::BroadcastOp op) -> bool {
-        return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
-      });
-
-  target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
-      [=](vector::MultiDimReductionOp op) -> bool {
-        return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
-      });
-
   target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
       [=](xegpu::ConvertLayoutOp op) -> bool {
         return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index a38993e0c55b1..de9e09d427665 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -144,6 +144,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
     std::string layoutName = getLayoutName(result);
     if (defOp->hasAttr(layoutName))
       return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+
+    // Check for "permanent" layout only after "temporary" layout name lookup,
+    // for backward compatibility.
+    if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
+      return loadGatherOp.getLayoutAttr();
   }
 
   if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -171,27 +176,77 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
   std::string layoutName = xegpu::getLayoutName(opr);
   if (op->hasAttr(layoutName))
     return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+
+  // Check for "permanent" layout only after "temporary" layout name lookup.
+  if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
+    if (auto layout = storeScatterOp.getLayoutAttr())
+      return layout;
+
   return getDistributeLayoutAttr(opr.get());
 }
 
+// Returns the permanent layout attribute for the given result if it's
+// available on the defining op. Otherwise returns the provided layout.
+xegpu::DistributeLayoutAttr
+maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
+                         const OpResult &result, mlir::Operation *owner,
+                         const std::string &name) {
+  xegpu::DistributeLayoutAttr candidate = layout;
+
+  if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
+    if (auto perm = loadOp.getLayoutAttr())
+      candidate = perm;
+  }
+
+  return candidate;
+}
+
+// Returns the permanent layout attribute for the given operand if it's
+// available on the owning op. Otherwise returns the provided layout.
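+// For xegpu::StoreScatterOp only the stored value (operand 0) can carry such
+// a permanent layout; all other operands keep the provided layout.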
+xegpu::DistributeLayoutAttr +maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, + const OpOperand &operand, mlir::Operation *owner, + const std::string &name) { + xegpu::DistributeLayoutAttr candidate = layout; + unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber(); + + if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) { + if (idx == 0) { + if (auto perm = storeOp.getLayoutAttr()) + candidate = perm; + } + } + + return candidate; +} + template <typename T, typename> void xegpu::setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout) { + const DistributeLayoutAttr layout, + bool respectPermLayout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getLayoutName(operandOrResult); - if (layout && !owner->hasAttrOfType<DistributeLayoutAttr>(name)) - owner->setAttr(name, layout); + + if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) + return; + + DistributeLayoutAttr candidate = layout; + if (respectPermLayout) + candidate = maybePickPermanentLayout(layout, operandOrResult, owner, name); + + if (candidate) + owner->setAttr(name, candidate); } // Explicit instantiation for OpResult template void xegpu::setDistributeLayoutAttr<mlir::OpResult>( const mlir::OpResult &result, - const mlir::xegpu::DistributeLayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); // Explicit instantiation for OpOperand template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>( const mlir::OpOperand &operand, - const mlir::xegpu::DistributeLayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); void xegpu::setDistributeLayoutAttrs( Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) { @@ -500,3 +555,29 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc, results.append(addElementwise(builder, loc, a, b)); return results; } + +template <typename T> +int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates, + ArrayRef<T> candidateMultiples) { + static_assert(std::is_integral<T>::value, "T must be an integer type"); + int largest = -1; + SmallVector<T> multiples = {1}; + if (!candidateMultiples.empty()) + multiples = + SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); + for (T candidate : candidates) { + for (T multiple : multiples) { + int value = static_cast<int>(candidate * multiple); + if (value != 0 && dim % value == 0 && value > largest) + largest = value; + } + } + return largest; +} + +/// Explicit instantiations +template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates, + ArrayRef<int> candidateMultiples); +template int +xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates, + ArrayRef<unsigned> candidateMultiples); diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index 3d86b09b32538..0964e1b8c5ef3 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -36,9 +36,6 @@ using mlir::LLVM::detail::createIntrinsicCall; static llvm::Intrinsic::ID getReduxIntrinsicId(llvm::Type *resultType, NVVM::ReduxKind kind, bool hasAbs, bool hasNaN) { - if (!(resultType->isIntegerTy(32) || resultType->isFloatTy())) - llvm_unreachable("unsupported data type for redux"); - switch (kind) { case NVVM::ReduxKind::ADD: return llvm::Intrinsic::nvvm_redux_sync_add; diff --git 
a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 3a23bbfd70eac..f8c38fadbd229 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -92,6 +92,22 @@ static OpBuilder::InsertPoint computeInsertPoint(ArrayRef<Value> vals) { return pt; } +namespace { +enum OpConversionMode { + /// In this mode, the conversion will ignore failed conversions to allow + /// illegal operations to co-exist in the IR. + Partial, + + /// In this mode, all operations must be legal for the given target for the + /// conversion to succeed. + Full, + + /// In this mode, operations are analyzed for legality. No actual rewrites are + /// applied to the operations on success. + Analysis, +}; +} // namespace + //===----------------------------------------------------------------------===// // ConversionValueMapping //===----------------------------------------------------------------------===// @@ -866,8 +882,9 @@ namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(ConversionPatternRewriter &rewriter, - const ConversionConfig &config) - : rewriter(rewriter), config(config), + const ConversionConfig &config, + OperationConverter &opConverter) + : rewriter(rewriter), config(config), opConverter(opConverter), notifyingRewriter(rewriter.getContext(), config.listener) {} //===--------------------------------------------------------------------===// @@ -1105,10 +1122,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// A set of operations that were modified by the current pattern. SetVector<Operation *> patternModifiedOps; - /// A set of blocks that were inserted (newly-created blocks or moved blocks) - /// by the current pattern. - SetVector<Block *> patternInsertedBlocks; - /// A list of unresolved materializations that were created by the current /// pattern. DenseSet<UnrealizedConversionCastOp> patternMaterializations; @@ -1128,6 +1141,9 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Dialect conversion configuration. const ConversionConfig &config; + /// The operation converter to use for recursive legalization. + OperationConverter &opConverter; + /// A set of erased operations. This set is utilized only if /// `allowPatternRollback` is set to "false". Conceptually, this set is /// similar to `replacedOps` (which is maintained when the flag is set to @@ -2046,8 +2062,6 @@ void ConversionPatternRewriterImpl::notifyBlockInserted( if (!config.allowPatternRollback && config.listener) config.listener->notifyBlockInserted(block, previous, previousIt); - patternInsertedBlocks.insert(block); - if (wasDetached) { // If the block was detached, it is most likely a newly created block. 
if (config.allowPatternRollback) { @@ -2090,9 +2104,10 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( //===----------------------------------------------------------------------===// ConversionPatternRewriter::ConversionPatternRewriter( - MLIRContext *ctx, const ConversionConfig &config) - : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this, config)) { + MLIRContext *ctx, const ConversionConfig &config, + OperationConverter &opConverter) + : PatternRewriter(ctx), impl(new detail::ConversionPatternRewriterImpl( + *this, config, opConverter)) { setListener(impl.get()); } @@ -2213,6 +2228,37 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, return success(); } +LogicalResult ConversionPatternRewriter::legalize(Region *r) { + // Fast path: If the region is empty, there is nothing to legalize. + if (r->empty()) + return success(); + + // Gather a list of all operations to legalize. This is done before + // converting the entry block signature because unrealized_conversion_cast + // ops should not be included. + SmallVector<Operation *> ops; + for (Block &b : *r) + for (Operation &op : b) + ops.push_back(&op); + + // If the current pattern runs with a type converter, convert the entry block + // signature. + if (const TypeConverter *converter = impl->currentTypeConverter) { + std::optional<TypeConverter::SignatureConversion> conversion = + converter->convertBlockSignature(&r->front()); + if (!conversion) + return failure(); + applySignatureConversion(&r->front(), *conversion, converter); + } + + // Legalize all operations in the region. + for (Operation *op : ops) + if (failed(legalize(op))) + return failure(); + + return success(); +} + void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues) { @@ -2399,17 +2445,12 @@ class OperationLegalizer { bool canApplyPattern(Operation *op, const Pattern &pattern); /// Legalize the resultant IR after successfully applying the given pattern. - LogicalResult legalizePatternResult(Operation *op, const Pattern &pattern, - const RewriterState &curState, - const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks); - - /// Legalizes the actions registered during the execution of a pattern. LogicalResult - legalizePatternBlockRewrites(Operation *op, - const SetVector<Block *> &insertedBlocks, - const SetVector<Operation *> &newOps); + legalizePatternResult(Operation *op, const Pattern &pattern, + const RewriterState &curState, + const SetVector<Operation *> &newOps, + const SetVector<Operation *> &modifiedOps); + LogicalResult legalizePatternCreatedOperations(const SetVector<Operation *> &newOps); LogicalResult @@ -2608,7 +2649,6 @@ LogicalResult OperationLegalizer::legalizeWithFold(Operation *op) { auto cleanup = llvm::make_scope_exit([&]() { rewriterImpl.patternNewOps.clear(); rewriterImpl.patternModifiedOps.clear(); - rewriterImpl.patternInsertedBlocks.clear(); }); // Upon failure, undo all changes made by the folder. 
@@ -2662,24 +2702,16 @@ LogicalResult OperationLegalizer::legalizeWithFold(Operation *op) { static void reportNewIrLegalizationFatalError(const Pattern &pattern, const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks) { + const SetVector<Operation *> &modifiedOps) { auto newOpNames = llvm::map_range( newOps, [](Operation *op) { return op->getName().getStringRef(); }); auto modifiedOpNames = llvm::map_range( modifiedOps, [](Operation *op) { return op->getName().getStringRef(); }); - StringRef detachedBlockStr = "(detached block)"; - auto insertedBlockNames = llvm::map_range(insertedBlocks, [&](Block *block) { - if (block->getParentOp()) - return block->getParentOp()->getName().getStringRef(); - return detachedBlockStr; - }); - llvm::report_fatal_error( - "pattern '" + pattern.getDebugName() + - "' produced IR that could not be legalized. " + "new ops: {" + - llvm::join(newOpNames, ", ") + "}, " + "modified ops: {" + - llvm::join(modifiedOpNames, ", ") + "}, " + "inserted block into ops: {" + - llvm::join(insertedBlockNames, ", ") + "}"); + llvm::report_fatal_error("pattern '" + pattern.getDebugName() + + "' produced IR that could not be legalized. " + + "new ops: {" + llvm::join(newOpNames, ", ") + "}, " + + "modified ops: {" + + llvm::join(modifiedOpNames, ", ") + "}"); } LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { @@ -2743,7 +2775,6 @@ LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { } rewriterImpl.patternNewOps.clear(); rewriterImpl.patternModifiedOps.clear(); - rewriterImpl.patternInsertedBlocks.clear(); LLVM_DEBUG({ logFailure(rewriterImpl.logger, "pattern failed to match"); if (rewriterImpl.config.notifyCallback) { @@ -2777,15 +2808,12 @@ LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { SetVector<Operation *> newOps = moveAndReset(rewriterImpl.patternNewOps); SetVector<Operation *> modifiedOps = moveAndReset(rewriterImpl.patternModifiedOps); - SetVector<Block *> insertedBlocks = - moveAndReset(rewriterImpl.patternInsertedBlocks); - auto result = legalizePatternResult(op, pattern, curState, newOps, - modifiedOps, insertedBlocks); + auto result = + legalizePatternResult(op, pattern, curState, newOps, modifiedOps); appliedPatterns.erase(&pattern); if (failed(result)) { if (!rewriterImpl.config.allowPatternRollback) - reportNewIrLegalizationFatalError(pattern, newOps, modifiedOps, - insertedBlocks); + reportNewIrLegalizationFatalError(pattern, newOps, modifiedOps); rewriterImpl.resetState(curState, pattern.getDebugName()); } if (config.listener) @@ -2823,8 +2851,7 @@ bool OperationLegalizer::canApplyPattern(Operation *op, LogicalResult OperationLegalizer::legalizePatternResult( Operation *op, const Pattern &pattern, const RewriterState &curState, const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks) { + const SetVector<Operation *> &modifiedOps) { [[maybe_unused]] auto &impl = rewriter.getImpl(); assert(impl.pendingRootUpdates.empty() && "dangling root updates"); @@ -2843,8 +2870,7 @@ LogicalResult OperationLegalizer::legalizePatternResult( #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS // Legalize each of the actions registered during application. 
- if (failed(legalizePatternBlockRewrites(op, insertedBlocks, newOps)) || - failed(legalizePatternRootUpdates(modifiedOps)) || + if (failed(legalizePatternRootUpdates(modifiedOps)) || failed(legalizePatternCreatedOperations(newOps))) { return failure(); } @@ -2853,53 +2879,6 @@ LogicalResult OperationLegalizer::legalizePatternResult( return success(); } -LogicalResult OperationLegalizer::legalizePatternBlockRewrites( - Operation *op, const SetVector<Block *> &insertedBlocks, - const SetVector<Operation *> &newOps) { - ConversionPatternRewriterImpl &impl = rewriter.getImpl(); - SmallPtrSet<Operation *, 16> alreadyLegalized; - - // If the pattern moved or created any blocks, make sure the types of block - // arguments get legalized. - for (Block *block : insertedBlocks) { - if (impl.erasedBlocks.contains(block)) - continue; - - // Only check blocks outside of the current operation. - Operation *parentOp = block->getParentOp(); - if (!parentOp || parentOp == op || block->getNumArguments() == 0) - continue; - - // If the region of the block has a type converter, try to convert the block - // directly. - if (auto *converter = impl.regionToConverter.lookup(block->getParent())) { - std::optional<TypeConverter::SignatureConversion> conversion = - converter->convertBlockSignature(block); - if (!conversion) { - LLVM_DEBUG(logFailure(impl.logger, "failed to convert types of moved " - "block")); - return failure(); - } - impl.applySignatureConversion(block, converter, *conversion); - continue; - } - - // Otherwise, try to legalize the parent operation if it was not generated - // by this pattern. This is because we will attempt to legalize the parent - // operation, and blocks in regions created by this pattern will already be - // legalized later on. - if (!newOps.count(parentOp) && alreadyLegalized.insert(parentOp).second) { - if (failed(legalize(parentOp))) { - LLVM_DEBUG(logFailure( - impl.logger, "operation '{0}'({1}) became illegal after rewrite", - parentOp->getName(), parentOp)); - return failure(); - } - } - } - return success(); -} - LogicalResult OperationLegalizer::legalizePatternCreatedOperations( const SetVector<Operation *> &newOps) { for (Operation *op : newOps) { @@ -3265,22 +3244,6 @@ static void reconcileUnrealizedCasts( // OperationConverter //===----------------------------------------------------------------------===// -namespace { -enum OpConversionMode { - /// In this mode, the conversion will ignore failed conversions to allow - /// illegal operations to co-exist in the IR. - Partial, - - /// In this mode, all operations must be legal for the given target for the - /// conversion to succeed. - Full, - - /// In this mode, operations are analyzed for legality. No actual rewrites are - /// applied to the operations on success. - Analysis, -}; -} // namespace - namespace mlir { // This class converts operations to a given conversion target via a set of // rewrite patterns. The conversion behaves differently depending on the @@ -3290,16 +3253,20 @@ struct OperationConverter { const FrozenRewritePatternSet &patterns, const ConversionConfig &config, OpConversionMode mode) - : rewriter(ctx, config), opLegalizer(rewriter, target, patterns), + : rewriter(ctx, config, *this), opLegalizer(rewriter, target, patterns), mode(mode) {} /// Converts the given operations to the conversion target. LogicalResult convertOperations(ArrayRef<Operation *> ops); -private: - /// Converts an operation with the given rewriter. - LogicalResult convert(Operation *op); + /// Converts a single operation. 
If `isRecursiveLegalization` is "true", the + /// conversion is a recursive legalization request, triggered from within a + /// pattern. In that case, do not emit errors because there will be another + /// attempt at legalizing the operation later (via the regular pre-order + /// legalization mechanism). + LogicalResult convert(Operation *op, bool isRecursiveLegalization = false); +private: /// The rewriter to use when converting operations. ConversionPatternRewriter rewriter; @@ -3311,32 +3278,42 @@ struct OperationConverter { }; } // namespace mlir -LogicalResult OperationConverter::convert(Operation *op) { +LogicalResult ConversionPatternRewriter::legalize(Operation *op) { + return impl->opConverter.convert(op, /*isRecursiveLegalization=*/true); +} + +LogicalResult OperationConverter::convert(Operation *op, + bool isRecursiveLegalization) { const ConversionConfig &config = rewriter.getConfig(); // Legalize the given operation. if (failed(opLegalizer.legalize(op))) { // Handle the case of a failed conversion for each of the different modes. // Full conversions expect all operations to be converted. - if (mode == OpConversionMode::Full) - return op->emitError() - << "failed to legalize operation '" << op->getName() << "'"; + if (mode == OpConversionMode::Full) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "'"; + return failure(); + } // Partial conversions allow conversions to fail iff the operation was not // explicitly marked as illegal. If the user provided a `unlegalizedOps` // set, non-legalizable ops are added to that set. if (mode == OpConversionMode::Partial) { - if (opLegalizer.isIllegal(op)) - return op->emitError() - << "failed to legalize operation '" << op->getName() - << "' that was explicitly marked illegal"; - if (config.unlegalizedOps) + if (opLegalizer.isIllegal(op)) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "' that was explicitly marked illegal"; + return failure(); + } + if (config.unlegalizedOps && !isRecursiveLegalization) config.unlegalizedOps->insert(op); } } else if (mode == OpConversionMode::Analysis) { // Analysis conversions don't fail if any operations fail to legalize, // they are only interested in the operations that were successfully // legalized. - if (config.legalizableOps) + if (config.legalizableOps && !isRecursiveLegalization) config.legalizableOps->insert(op); } return success(); @@ -3800,10 +3777,11 @@ static LogicalResult convertFuncOpTypes(FunctionOpInterface funcOp, TypeConverter::SignatureConversion result(type.getNumInputs()); SmallVector<Type, 1> newResults; if (failed(typeConverter.convertSignatureArgs(type.getInputs(), result)) || - failed(typeConverter.convertTypes(type.getResults(), newResults)) || - failed(rewriter.convertRegionTypes(&funcOp.getFunctionBody(), - typeConverter, &result))) + failed(typeConverter.convertTypes(type.getResults(), newResults))) return failure(); + if (!funcOp.getFunctionBody().empty()) + rewriter.applySignatureConversion(&funcOp.getFunctionBody().front(), result, + &typeConverter); // Update the function signature in-place. 
auto newType = FunctionType::get(rewriter.getContext(), diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py index 14c7380e432f0..d9ab504f0de54 100644 --- a/mlir/python/mlir/dialects/transform/structured.py +++ b/mlir/python/mlir/dialects/transform/structured.py @@ -713,6 +713,8 @@ def __init__( disable_transfer_permutation_map_lowering_patterns: bool = False, vectorize_nd_extract: bool = False, vectorize_padding: bool = False, + flatten_1d_depthwise_conv: bool = False, + fold_type_extensions_into_contract: bool = False, loc=None, ip=None, ): @@ -722,8 +724,10 @@ def __init__( target, disable_multi_reduction_to_contract_patterns=disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns=disable_transfer_permutation_map_lowering_patterns, + flatten_1d_depthwise_conv=flatten_1d_depthwise_conv, vectorize_nd_extract=vectorize_nd_extract, vectorize_padding=vectorize_padding, + fold_type_extensions_into_contract=fold_type_extensions_into_contract, loc=loc, ip=ip, ) diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index ba12ff29ebef9..b5dcb01d3dc6b 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -747,3 +747,29 @@ func.func @memref_bitcast(%1: memref<?xi16>) -> memref<?xbf16> { %2 = arith.bitcast %1 : memref<?xi16> to memref<?xbf16> func.return %2 : memref<?xbf16> } + +// ----- + +// CHECK-LABEL: func @unsupported_fp_type +// CHECK: arith.addf {{.*}} : f4E2M1FN +// CHECK: arith.addf {{.*}} : vector<4xf4E2M1FN> +// CHECK: arith.addf {{.*}} : vector<8x4xf4E2M1FN> +func.func @unsupported_fp_type(%arg0: f4E2M1FN, %arg1: vector<4xf4E2M1FN>, %arg2: vector<8x4xf4E2M1FN>) -> (f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN>) { + %0 = arith.addf %arg0, %arg0 : f4E2M1FN + %1 = arith.addf %arg1, %arg1 : vector<4xf4E2M1FN> + %2 = arith.addf %arg2, %arg2 : vector<8x4xf4E2M1FN> + return %0, %1, %2 : f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN> +} + +// ----- + +// CHECK-LABEL: func @supported_fp_type +// CHECK: llvm.fadd {{.*}} : f32 +// CHECK: llvm.fadd {{.*}} : vector<4xf32> +// CHECK-COUNT-4: llvm.fadd {{.*}} : vector<8xf32> +func.func @supported_fp_type(%arg0: f32, %arg1: vector<4xf32>, %arg2: vector<4x8xf32>) -> (f32, vector<4xf32>, vector<4x8xf32>) { + %0 = arith.addf %arg0, %arg0 : f32 + %1 = arith.addf %arg1, %arg1 : vector<4xf32> + %2 = arith.addf %arg2, %arg2 : vector<4x8xf32> + return %0, %1, %2 : f32, vector<4xf32>, vector<4x8xf32> +} diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir index a75f30d57fa74..cd8cfc8736915 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir @@ -275,6 +275,42 @@ func.func @reduction_minimumf(%v : vector<3xf32>, %s: f32) -> f32 { // ----- +// CHECK-LABEL: spirv.func @reduction_minnumf( +// CHECK-SAME: %[[V:.*]]: vector<3xf32>, +// CHECK-SAME: %[[S:.*]]: f32) -> f32 "None" { +// CHECK: %[[S0:.*]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xf32> +// CHECK: %[[S1:.*]] = spirv.CompositeExtract %[[V]][1 : i32] : vector<3xf32> +// CHECK: %[[S2:.*]] = spirv.CompositeExtract %[[V]][2 : i32] : vector<3xf32> +// CHECK: %[[MIN0:.*]] = spirv.GL.FMin %[[S0]], %[[S1]] : f32 +// CHECK: %[[MIN1:.*]] = spirv.GL.FMin %[[MIN0]], %[[S2]] : f32 +// CHECK: %[[MIN2:.*]] = spirv.GL.FMin %[[MIN1]], %[[S]] : f32 +// 
CHECK: spirv.ReturnValue %[[MIN2]] : f32 +// CHECK: } +func.func @reduction_minnumf(%v : vector<3xf32>, %s: f32) -> f32 { + %reduce = vector.reduction <minnumf>, %v, %s : vector<3xf32> into f32 + return %reduce : f32 +} + +// ----- + +// CHECK-LABEL: spirv.func @reduction_maxnumf( +// CHECK-SAME: %[[V:.*]]: vector<3xf32>, +// CHECK-SAME: %[[S:.*]]: f32) -> f32 "None" { +// CHECK: %[[S0:.*]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xf32> +// CHECK: %[[S1:.*]] = spirv.CompositeExtract %[[V]][1 : i32] : vector<3xf32> +// CHECK: %[[S2:.*]] = spirv.CompositeExtract %[[V]][2 : i32] : vector<3xf32> +// CHECK: %[[MAX0:.*]] = spirv.GL.FMax %[[S0]], %[[S1]] : f32 +// CHECK: %[[MAX1:.*]] = spirv.GL.FMax %[[MAX0]], %[[S2]] : f32 +// CHECK: %[[MAX2:.*]] = spirv.GL.FMax %[[MAX1]], %[[S]] : f32 +// CHECK: spirv.ReturnValue %[[MAX2]] : f32 +// CHECK: } +func.func @reduction_maxnumf(%v : vector<3xf32>, %s: f32) -> f32 { + %reduce = vector.reduction <maxnumf>, %v, %s : vector<3xf32> into f32 + return %reduce : f32 +} + +// ----- + // CHECK-LABEL: func @reduction_maxsi // CHECK-SAME: (%[[V:.+]]: vector<3xi32>, %[[S:.+]]: i32) // CHECK: %[[S0:.+]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xi32> diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 8cce6308018e2..dcf4ddb2dd48c 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -491,12 +491,12 @@ func.func @mbarrier() { // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]] + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive %[[barPtr2]] %token = nvgpu.mbarrier.arrive %barrier[%c0] : !barrierType -> !tokenType // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + // CHECK: nvvm.mbarrier.test.wait %[[barPtr3]], %[[token]] %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return @@ -521,12 +521,12 @@ func.func @mbarrier_nocomplete() { // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]] + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete %[[barPtr2]] %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !barrierType -> !tokenType // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + // CHECK: nvvm.mbarrier.test.wait %[[barPtr3]], %[[token]] %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return @@ -572,7 +572,7 @@ func.func @mbarrier_wait(%barriers : !nvgpu.mbarrier.group<memorySpace = #gpu.ad 
// CHECK: %[[S3:.+]] = builtin.unrealized_conversion_cast %[[S2]] : index to i64 // CHECK: %[[S4:.+]] = llvm.extractvalue %[[CARG0]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[S5:.+]] = llvm.getelementptr %[[S4]][%[[S3]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 -// CHECK: nvvm.mbarrier.test.wait.shared {{.*}}, %[[CARG1]] +// CHECK: nvvm.mbarrier.test.wait {{.*}}, %[[CARG1]] %mbarId = arith.remui %i, %numBarriers : index %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 5>, !tokenType } diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index fbc4c0af60360..a9356c5cb60bb 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -88,10 +88,10 @@ func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr - // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} - nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> - // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true} - nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> + // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} + nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3> + // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} + nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3> llvm.return } diff --git a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir index 9908205f07c92..ae5141db16c09 100644 --- a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir @@ -9,11 +9,12 @@ func.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vecto // CHECK-LABEL: @load_1D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // CHECK: return %[[VEC]] // ----- @@ -28,35 +29,29 @@ func.func @load_2D_vector(%source: memref<8x16x32xf32>, // CHECK-LABEL: @load_2D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], 
%[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- func.func @load_dynamic_source(%source: memref<?x?x?xf32>, - %offset: index) -> vector<8x16xf32> { - %0 = vector.load %source[%offset, %offset, %offset] + %i: index, %j: index, %k: index) -> vector<8x16xf32> { + %0 = vector.load %source[%i, %j, %k] : memref<?x?x?xf32>, vector<8x16xf32> return %0 : vector<8x16xf32> } // CHECK-LABEL: @load_dynamic_source( // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// CHECK-SAME: %[[OFFSET:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- @@ -72,9 +67,9 @@ func.func @load_out_of_bounds(%source: memref<7x15xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<7x15xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: %[[SRC]] // CHECK-SAME: memref<7x15xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir index 2c498dcc2a071..1a10d917623cc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir @@ -11,11 +11,12 @@ func.func @store_1D_vector(%vec: vector<8xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> // ----- @@ -30,16 +31,17 @@ func.func @store_2D_vector(%vec: vector<8x16xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = 
memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- func.func @store_dynamic_source(%vec: vector<8x16xf32>, - %source: memref<?x?x?xf32>, %offset: index) { - vector.store %vec, %source[%offset, %offset, %offset] + %source: memref<?x?x?xf32>, %i: index, %j: index, %k: index) { + vector.store %vec, %source[%i, %j, %k] : memref<?x?x?xf32>, vector<8x16xf32> return } @@ -47,18 +49,11 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-LABEL: @store_dynamic_source( // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// CHECK-SAME: %[[OFFSET:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> // ----- @@ -74,9 +69,9 @@ func.func @store_out_of_bounds(%vec: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<7x64xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: %[[SRC]] // CHECK-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index c4ca79af1bd9a..c87a5304babfe 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -12,11 +12,12 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector // LOAD-ND-LABEL: @load_1D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// LOAD-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// 
LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_1D_vector( @@ -46,11 +47,12 @@ gpu.func @load_2D_vector(%source: memref<8x16x32xf32>, // LOAD-ND-LABEL: @load_2D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// LOAD-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_2D_vector( @@ -83,9 +85,9 @@ gpu.func @load_zero_pad_out_of_bounds(%source: memref<32x64xf32>, // LOAD-ND-LABEL: @load_zero_pad_out_of_bounds( // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_zero_pad_out_of_bounds( @@ -109,9 +111,9 @@ gpu.func @load_transposed(%source: memref<32x64xf32>, // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, // LOAD-ND-SAME: %[[OFFSET1:.+]]: index, // LOAD-ND-SAME: %[[OFFSET2:.+]]: index -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET1]], %[[OFFSET2]]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<16x8xf32 -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]] <{transpose = array<i64: 1, 0>}> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]] <{transpose = array<i64: 1, 0>}> // LOAD-ND-SAME: -> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -143,16 +145,11 @@ gpu.func @load_dynamic_source(%source: memref<?x?x?xf32>, } // LOAD-ND-LABEL: @load_dynamic_source( // LOAD-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[C2:.+]] = arith.constant 2 : index -// LOAD-ND: %[[C1:.+]] = arith.constant 1 : index -// LOAD-ND: %[[C0:.+]] = arith.constant 0 : index -// LOAD-ND-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// LOAD-ND-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// LOAD-ND-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// LOAD-ND: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET:.+]], %[[OFFSET:.+]], %[[OFFSET:.+]]] -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview 
%[[SRC]][%[[OFF0]], 0, 0] +// LOAD-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -184,10 +181,11 @@ gpu.func @load_dynamic_source2(%source: memref<?x8x16xf32>, } // LOAD-ND-LABEL: @load_dynamic_source2( -// LOAD-ND-DAG: %[[C0:.+]] = arith.constant 0 : index -// LOAD-ND-DAG: %[[DIM:.+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x8x16xf32> -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], shape : [%[[DIM]], 8, 16], strides : [128, 16, 1] : memref<?x8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<8x16xf32> +// LOAD-ND-SAME: %[[SRC:.+]]: memref<?x8x16xf32>, +// LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32, strided<[16, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<8x16xf32> // LOAD-ND: return %[[VEC]] : vector<8x16xf32> // LOAD-GATHER-LABEL: @load_dynamic_source2( @@ -418,11 +416,12 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SUBVIEW]][%[[OFF2]], %[[OFF2]]] -// LOAD-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8xf16, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf16> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]]]{{.*}}-> vector<8xf16> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_from_subview( diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index fcfc9414da4f6..43a1a7206e2cc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --xevm-attach-target='module=xevm.* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-ND +// RUN: mlir-opt %s --xevm-attach-target='module=xevm_* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-ND // RUN: mlir-opt %s -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-SCATTER @@ -15,11 +15,12 @@ gpu.func @store_1D_vector(%vec: vector<8xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf32>, // 
STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> // STORE-SCATTER-LABEL: @store_1D_vector( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8xf32>, @@ -49,11 +50,12 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_2D_vector( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>, @@ -73,8 +75,8 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // ----- gpu.module @xevm_module { gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, - %source: memref<?x?x?xf32>, %offset: index) { - vector.transfer_write %vec, %source[%offset, %offset, %offset] + %source: memref<?x?x?xf32>, %i: index, %j: index, %k: index) { + vector.transfer_write %vec, %source[%i, %j, %k] {in_bounds = [true, true]} : vector<8x16xf32>, memref<?x?x?xf32> gpu.return @@ -83,18 +85,11 @@ gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, // STORE-ND-LABEL: @store_dynamic_source( // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// STORE-ND-SAME: %[[OFFSET:.+]]: index -// STORE-ND-DAG: %[[C0:.+]] = arith.constant 0 : index -// STORE-ND-DAG: %[[C1:.+]] = arith.constant 1 : index -// STORE-ND-DAG: %[[C2:.+]] = arith.constant 2 : index -// STORE-ND-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// STORE-ND-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// STORE-ND-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// STORE-ND: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// STORE-ND-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32 -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// STORE-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], 
%[[OFF2]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_dynamic_source( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>, @@ -126,9 +121,9 @@ gpu.func @store_out_of_bounds(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<7x64xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// STORE-ND-SAME: %[[SRC]] // STORE-ND-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_out_of_bounds( // STORE-SCATTER: vector.transfer_write @@ -298,13 +293,13 @@ gpu.func @store_to_subview(%vec: vector<8xf16>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf16>, // STORE-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // STORE-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index -// STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] -// STORE-ND-SAME: : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SUBVIEW]][%[[OFF2]], %[[OFF2]]] -// STORE-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8xf16, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf16> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF2]]] : vector<8xf16> // STORE-SCATTER-LABEL: @store_to_subview( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8xf16>, diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir index b616632a6fe24..b062736575ad7 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir @@ -243,6 +243,106 @@ func.func @vecdim_reduction_ori(%in: memref<256x512xi32>, %out: memref<256xi32>) // CHECK: affine.store %[[final_red]], %{{.*}} : memref<256xi32> // CHECK: } +// ----- + +func.func @vecdim_reduction_xori(%in: memref<256x512xi32>, %out: memref<256xi32>) { + %cst = arith.constant 0 : i32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) { + %ld = affine.load %in[%i, %j] : memref<256x512xi32> + %xor = arith.xori %red_iter, %ld : i32 + affine.yield %xor : i32 + } + affine.store %final_red, %out[%i] : memref<256xi32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_xori( +// CHECK-SAME: %[[input:.*]]: memref<256x512xi32>, +// CHECK-SAME: %[[output:.*]]: memref<256xi32>) { +// CHECK: %[[cst:.*]] = arith.constant 0 : i32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0> : vector<128xi32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xi32>) { +// CHECK: %[[poison:.*]] = ub.poison : i32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : 
memref<256x512xi32>, vector<128xi32> +// CHECK: %[[xor:.*]] = arith.xori %[[red_iter]], %[[ld]] : vector<128xi32> +// CHECK: affine.yield %[[xor]] : vector<128xi32> +// CHECK: } +// CHECK: %[[final_red:.*]] = vector.reduction <xor>, %[[vred]] : vector<128xi32> into i32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xi32> +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func.func @vecdim_reduction_minnumf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %min = arith.minnumf %red_iter, %ld : f32 + affine.yield %min : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_minnumf( +// CHECK-SAME: %[[input:.*]]: memref<256x512xf32>, +// CHECK-SAME: %[[output:.*]]: memref<256xf32>) { +// CHECK: %[[cst:.*]] = arith.constant 0xFF800000 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0x7FC00000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[poison:.*]] = ub.poison : f32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[min:.*]] = arith.minnumf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[min]] : vector<128xf32> +// CHECK: } +// CHECK: %[[red_scalar:.*]] = vector.reduction <minnumf>, %[[vred]] : vector<128xf32> into f32 +// CHECK: %[[final_red:.*]] = arith.minnumf %[[red_scalar]], %[[cst]] : f32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xf32> +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func.func @vecdim_reduction_maxnumf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %max = arith.maxnumf %red_iter, %ld : f32 + affine.yield %max : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_maxnumf( +// CHECK-SAME: %[[input:.*]]: memref<256x512xf32>, +// CHECK-SAME: %[[output:.*]]: memref<256xf32>) { +// CHECK: %[[cst:.*]] = arith.constant 0xFF800000 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0xFFC00000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[poison:.*]] = ub.poison : f32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[max:.*]] = arith.maxnumf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[max]] : vector<128xf32> +// CHECK: } +// CHECK: %[[red_scalar:.*]] = vector.reduction <maxnumf>, %[[vred]] : vector<128xf32> into f32 +// CHECK: %[[final_red:.*]] = arith.maxnumf %[[red_scalar]], %[[cst]] : f32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xf32> +// CHECK: } +// CHECK: return +// CHECK: } // ----- diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir 
b/mlir/test/Dialect/LLVMIR/invalid.mlir index aaf9f8024bfbe..49b6342aea538 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -664,21 +664,21 @@ func.func @zero_non_llvm_type() { // ----- func.func @nvvm_invalid_shfl_pred_1(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected return type to be a two-element struct}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> i32 } // ----- func.func @nvvm_invalid_shfl_pred_2(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected return type to be a two-element struct}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32)> } // ----- func.func @nvvm_invalid_shfl_pred_3(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected second element in the returned struct to be of type 'i1' but got 'i32' instead}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i32)> } diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 2505e56407c2b..cd7bd37da5763 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -445,8 +445,8 @@ llvm.func private @mbarrier_arrive(%barrier: !llvm.ptr) { } llvm.func private @mbarrier_arrive_shared(%barrier: !llvm.ptr<3>) { - // CHECK: nvvm.mbarrier.arrive.shared %{{.*}} : !llvm.ptr<3> - %0 = nvvm.mbarrier.arrive.shared %barrier : !llvm.ptr<3> -> i64 + // CHECK: nvvm.mbarrier.arrive %{{.*}} : !llvm.ptr<3> + %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64 llvm.return } @@ -459,8 +459,8 @@ llvm.func private @mbarrier_arrive_nocomplete(%barrier: !llvm.ptr) { llvm.func private @mbarrier_arrive_nocomplete_shared(%barrier: !llvm.ptr<3>) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.arrive.nocomplete.shared %{{.*}} : !llvm.ptr<3> - %0 = nvvm.mbarrier.arrive.nocomplete.shared %barrier, %count : !llvm.ptr<3>, i32 -> i64 + // CHECK: nvvm.mbarrier.arrive.nocomplete %{{.*}} : !llvm.ptr<3> + %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64 llvm.return } @@ -472,8 +472,8 @@ llvm.func private @mbarrier_test_wait(%barrier: !llvm.ptr, %token : i64) -> i1 { llvm.func private @mbarrier_test_wait_shared(%barrier: !llvm.ptr<3>, %token : i64) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.test.wait.shared %{{.*}} - %isComplete = nvvm.mbarrier.test.wait.shared %barrier, %token : !llvm.ptr<3>, i64 -> i1 + // CHECK: nvvm.mbarrier.test.wait %{{.*}} + %isComplete = nvvm.mbarrier.test.wait %barrier, %token : !llvm.ptr<3>, i64 -> i1 llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir b/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir new file mode 100644 index 0000000000000..35f5e1b3c8ba2 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir @@ -0,0 +1,90 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics + +// Test invalid target architecture (sm_100 instead of sm_100a) +gpu.module 
@invalid_arch_sm_100 [#nvvm.target<chip = "sm_100">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0x12345678 : i32) : i32 + // expected-error@+1 {{'nvvm.convert.f32x2.to.f16x2' op is not supported on sm_100}} + %res = nvvm.convert.f32x2.to.f16x2 %f1, %f2, %rbits : vector<2xf16> + return + } +} + +// ----- + +// Test that operations require stochastic rounding mode +llvm.func @invalid_rnd_mode_f16x2(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // expected-error@+1 {{Only RS rounding mode is supported for conversions from f32x2 to f16x2.}} + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// ----- + +llvm.func @invalid_rnd_mode_bf16x2(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // expected-error@+1 {{Only RS rounding mode is supported for conversions from f32x2 to bf16x2.}} + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// ----- + +// Test invalid destination types for f8x4 (should only accept f8E4M3FN, f8E5M2) +llvm.func @invalid_dst_type_f8x4_e3m4(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f32x4 to f8x4.}} + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E3M4) + llvm.return %res : vector<4xi8> +} + +// ----- + +llvm.func @invalid_dst_type_f8x4_e8m0(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f32x4 to f8x4.}} + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E8M0FNU) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test invalid destination types for f6x4 (should only accept f6E2M3FN, f6E3M2FN) +llvm.func @invalid_dst_type_f6x4_f8(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f6E2M3FN' and 'f6E3M2FN' types are supported for conversions from f32x4 to f6x4.}} + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test invalid destination type for f4x4 (should only accept f4E2M1FN) +llvm.func @invalid_dst_type_f4x4_f6(%src : vector<4xf32>, %rbits : i32) -> i16 { + // expected-error@+1 {{Only 'f4E2M1FN' type is supported for conversions from f32x4 to f4x4.}} + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits : vector<4xf32> -> i16 (f6E2M3FN) + llvm.return %res : i16 +} + +// ----- + +// Test invalid rounding modes for non-stochastic ops +llvm.func @convert_float_to_tf32_rs_not_supported(%src : f32) -> i32 { + // expected-error @below {{Only {rn,rz,rna} rounding modes supported for ConvertFloatToTF32Op.}} + %res = nvvm.convert.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode<rs>} + llvm.return %res : i32 +} + +// ----- + +llvm.func @convert_f32x2_to_f8x2_rs_not_supported(%a : f32, %b : f32) { + // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}} + %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rs>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN) + llvm.return +} + +// ----- + +llvm.func @convert_bf16x2_to_f8x2_rs_not_supported(%src : vector<2xbf16>) { + // expected-error 
@below {{Only RZ and RP rounding modes are supported for conversions from bf16x2 to f8x2.}} + %res = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rs>} : vector<2xbf16> -> i16 (f8E8M0FNU) + llvm.return +} diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index e703600c71c8e..5e857599b65ea 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -664,6 +664,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { + // CHECK-LABEL @rocdl.global.load.async.to.lds + // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b32 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b64 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b128 %{{.*}}, %{{.*}}, 0, 0 + rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : <1>, <3> + llvm.return +} + // CHECK-LABEL @rocdl.tensor.load.to.lds llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index 26b63fbe182ea..0e75894eaeceb 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -492,6 +492,15 @@ func.func @fct1(%0 : !llvm.ptr) -> () { // ----- +%i1 = arith.constant 1 : i32 +%i2 = arith.constant 10 : i32 +// expected-error@+1 {{unstructured acc.loop must not have induction variables}} +acc.loop control(%iv : i32) = (%i1 : i32) to (%i2 : i32) step (%i1 : i32) { + acc.yield +} attributes {independent = [#acc.device_type<none>], unstructured} + +// ----- + // expected-error@+1 {{expect at least one of num, dim or static values}} acc.loop gang({}) { "test.openacc_dummy_op"() : () -> () diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 042ee2503cb95..df8ab9b7dd239 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -2143,6 +2143,20 @@ func.func @acc_loop_container() { // ----- +func.func @acc_unstructured_loop() { + acc.loop { + acc.yield + } attributes {independent = [#acc.device_type<none>], unstructured} + return +} + +// CHECK-LABEL: func.func @acc_unstructured_loop +// CHECK: acc.loop +// CHECK: acc.yield +// CHECK: } attributes {independent = [#acc.device_type<none>], unstructured} + +// ----- + // Test private recipe with data bounds for array slicing acc.private.recipe @privatization_memref_slice : memref<10x10xf32> init { ^bb0(%arg0: memref<10x10xf32>, %bounds0: !acc.data_bounds_ty, %bounds1: !acc.data_bounds_ty): diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index ebbe3ce0ec0d0..92f353717ac59 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -451,7 +451,7 @@ func.func @store_scatter_offset_wi_1(%src: memref<?xf16>) { %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> // expected-error@+1 {{Mask should match value except the chunk size dim}} - xegpu.store %val, %src[%offsets], %mask + xegpu.store %val, %src[%offsets], %mask : 
vector<4xf16>, memref<?xf16>, vector<1xindex>, vector<1xi1> return } @@ -870,14 +870,6 @@ func.func @load_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>) { return } -// ----- -func.func @load_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - %data2 = xegpu.load_matrix %arg0[8, 8] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16> -> vector<16x16xf16> - return -} - - // ----- func.func @store_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf32>) { // expected-error@+1 {{failed to verify that all of {mem_desc, data} have same element type}} @@ -900,16 +892,25 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>, %arg1: vector<2x16xf32>) { + // expected-error@+1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : + vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>> return } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>, %arg1: vector<16x2xf32>) { + // expected-error@+1 {{With subgroup_block_io, the distributed dimensions must be contiguous}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>> return } +// ----- +func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>, %arg1: vector<16x2xf32>) { + // expected-error@+1 {{With subgroup_block_io, the block shape must match the lane layout}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>> + return +} diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 0a10f6814ae96..9b3829664108d 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -278,6 +278,15 @@ gpu.func @subgroup_load_nd_offset_1(%src: memref<24x32xf32>, %x : index, %y : in gpu.return } +// CHECK: func @subgroup_load_nd_offset_2(%[[arg0:.*]]: memref<24x32xf32>, %arg1: index) { +gpu.func @subgroup_load_nd_offset_2(%src: memref<24x32xf32>, %x : index) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> 
-> !xegpu.tensor_desc<16x8xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][%arg1, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + %2 = xegpu.load_nd %1[%x, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + gpu.return +} + // CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir new file mode 100644 index 0000000000000..24a0de6ed48a5 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -0,0 +1,280 @@ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ +// RUN: --xegpu-optimize-block-loads --canonicalize --cse --split-input-file %s | FileCheck %s + +// CHECK-LABEL: gpu.func @no_scf( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C32:.*]] = arith.constant 32 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xf16> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x16xf32> { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %1 = xegpu.load_nd %0[%c0, %c32] { result_layout = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #a } : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + gpu.return %6 : vector<8x16xf32> +} +} + +// ----- +// CHECK-LABEL: gpu.func @no_scf_i8( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xi8>, %{{.*}}: vector<8x32xi8>) -> vector<8x16xi32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xi8> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64 +// CHECK-SAME: -> 
!xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK: %[[T3:.*]] = vector.bitcast %[[T2]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]>} : vector<16x8xi32> to vector<16x32xi8> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]> +#c = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +gpu.module @xevm_module { +gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8x16xi32> { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xi8> -> !xegpu.tensor_desc<16x32xi8, #b> + %1 = xegpu.load_nd %0[%c0, %c64] { result_layout = #b } : !xegpu.tensor_desc<16x32xi8, #b> -> vector<16x32xi8> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x32xi8> to vector<32x16xi8> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #c } : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> + gpu.return %6 : vector<8x16xi32> +} +} + + +// ----- +// CHECK-LABEL: gpu.func @gemm_b_transpose( +// CHECK-SAME: %{{.*}} memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%c128, 1] +// CHECK-SAME: : i64 -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : 
memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%c0, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @nested_scf( +// CHECK-SAME: %{{.*}}: memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: scf.for %{{.*}} to %{{.*}} step %{{.*}} { +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout< +// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + scf.for %arg8 = %c0 to %c256 step %c16 { + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] 
{ layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + } + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @large_loads( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x16xi32> +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 0], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 8], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %{{.*}} = vector.bitcast %[[T10]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x16xi32> to vector<32x32xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<32x32xf16, #b> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, 
vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %7 = vector.extract_strided_slice %6 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %6 {offsets = [0, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %6 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %6 {offsets = [16, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @array_length( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 -> +// CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = 
vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> + -> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } + : !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x32x16xf16> + %19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %7 = vector.extract_strided_slice %19 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %19 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %20 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %20 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 
{layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 543e119d81d88..61e315d0d2080 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -215,6 +215,46 @@ func.func @scatter_ops(%src: memref<256xf16>) { } // ----- gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_custom_perm_layout( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = arith.addf %3, %3 : vector<16xf16> + xegpu.store %4, %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_preserve_load_perm_layout( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], 
lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = arith.addf %3, %3 : vector<16xf16> + xegpu.store %4, %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} +// ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16> diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 27a3dc373c739..8fd3cca5594cb 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -265,3 +265,68 @@ gpu.module @xevm_module{ gpu.return } } + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) { +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[LANE_ID:.*]] = gpu.lane_id +// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C8]] +// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C8]] +// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C2]] +// CHECK: %[[REMU3:.*]] = index.remu %[[REMU2]], %[[C2]] +// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C8]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[REMU4]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[REMU4]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) { + %c0 = arith.constant 0 : index + %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32> + xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) { +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[LANE_ID:.*]] = gpu.lane_id +// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C4]] +// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C4]] +// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C4]] +// CHECK: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2]] +// CHECK: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C8]] +// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C4]] +// CHECK: %[[ADD:.*]] = index.add %[[REMU4]], %[[C1]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[ADD]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +gpu.module 
@xevm_module{ + gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_3({{.*}}) { +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>: +// CHECK-SAME: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index -> vector<1x2xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>: +// CHECK-SAME: vector<1x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : + !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index + gpu.return + } +} diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir index b73bc69393dab..02c5f71d5c83d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir @@ -1,33 +1,32 @@ // RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s -//CHECk: #map = affine_map<()[s0] -> (s0 floordiv 8)> gpu.module @test { gpu.func @slice_attr() -> vector<128xindex> { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> + // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> gpu.return %step : vector<128xindex> } gpu.func @nested_slice_attr() -> vector<128xindex> { - //CHECK: 
[[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[DIVU1:.*]] = index.divu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[DIVU2:.*]] = index.divu %[[DIVU1]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU2]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> + // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex> gpu.return %0 : vector<128xindex> } -} \ No newline at end of file +} + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir index 09df1e4da43e2..9580769d37313 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -166,14 +166,12 @@ gpu.module @test_elementwise_ops { %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>> -> vector<24x32xf32> - // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32> // CHECK-NOT: arith.negf %negf = arith.negf %load_a {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} : vector<24x32xf32> - // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32> // CHECK-NOT: math.powf %powf = math.powf %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index d2d250cbe0f66..01134d8eaabec 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -1,14 +1,10 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -#map = affine_map<()[s0] -> (s0 floordiv 4)> -#map1 = affine_map<()[s0] -> (s0 mod 4)> - gpu.module @test_round_robin_assignment { // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : 
memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.create_nd_tdesc %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -16,22 +12,23 @@ gpu.module @test_round_robin_assignment { } // CHECK-LABEL: create_nd_tdesc_with_shared_data - // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IdY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[IdX:%.+]] = affine.apply #map1()[[[sgId]]] - //CHECK: [[C16:%.+]] = arith.constant 16 : index - //CHECK: [[LY:%.+]] = index.mul [[IdY]], [[C16]] - //CHECK: [[C64:%.+]] = arith.constant 64 : index - //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]] - //CHECK: [[C64_2:%.+]] = arith.constant 64 : index - //CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]] - //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C4:.*]] = arith.constant 4 : index + // CHECK: %[[IDX:.*]] = index.remu %[[SGID]], %[[C4]] + // CHECK: %[[IDY_DIV:.*]] = index.divu %[[SGID]], %[[C4]] + // CHECK: %[[C8:.*]] = arith.constant 8 : index + // CHECK: %[[IDY:.*]] = index.remu %[[IDY_DIV]], %[[C8]] + // CHECK: %[[C16:.*]] = arith.constant 16 : index + // CHECK: %[[LY:.*]] = index.mul %[[IDY]], %[[C16]] + // CHECK: %[[C64:.*]] = arith.constant 64 : index + // CHECK: %[[LX:.*]] = index.mul %[[IDX]], %[[C64]] + // CHECK: %[[C128:.*]] = arith.constant 128 : index + // CHECK: %[[OFFY:.*]] = index.remu %[[LY]], %[[C128]] + // CHECK: %[[C64_1:.*]] = arith.constant 64 : index + // CHECK: %[[OFFX:.*]] = index.remu %[[LX]], %[[C64_1]] + // CHECK: xegpu.create_nd_tdesc %[[ARG_0]][%[[OFFY]], %[[OFFX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>> gpu.return @@ -42,9 +39,7 @@ gpu.module @test_round_robin_assignment { gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.load_nd %{{.*}} - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -57,9 +52,8 @@ 
gpu.module @test_round_robin_assignment { gpu.func @store_nd(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-4: : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT : xegpu.store_nd + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-NOT: xegpu.store_nd %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<256x128xf32> @@ -73,8 +67,7 @@ gpu.module @test_round_robin_assignment { gpu.func @update_nd(%src: memref<256x128xf32>){ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>> + // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.update_nd_offset %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -84,15 +77,9 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [4, 8], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -113,8 +100,7 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: 
prefetch_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} - // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.prefetch_nd %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -131,9 +117,7 @@ gpu.module @test_round_robin_assignment { %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>> -> vector<128x1xf32> - // CHECK-COUNT-2: vector.broadcast {{.*}} - // CHECK-SAME-COUNT-2: {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-2: : vector<16x1xf32> to vector<16x32xf32> + // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32> // CHECK-NOT: vector.broadcast %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>} @@ -171,10 +155,10 @@ gpu.module @test_round_robin_assignment { %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> - //CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) + // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) { %4 = arith.cmpi slt, %arg3, %c10_i32 : i32 - //CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32 + // CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32 scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32) @@ -195,16 +179,16 @@ gpu.module @test_round_robin_assignment { %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> %3 = arith.cmpi eq, %0, %c10 : index // CHECK-LABEL: scf.if - // CHECK-SAME: (vector<16xf32>, vector<16xf32>) + // CHECK-SAME: (vector<16xf32>, vector<16xf32>) %4 = scf.if %3 -> (vector<256xf32>) { %5 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> // CHECK-LABEL: scf.yield - // CHECK-SAME: vector<16xf32>, vector<16xf32> + // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } else { %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> // CHECK-LABEL: scf.yield - // CHECK-SAME: vector<16xf32>, vector<16xf32> + // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } {layout_result_0 = #xegpu.layout<sg_layout = [8], 
sg_data = [16]>} xegpu.store_nd %4, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> @@ -220,16 +204,16 @@ gpu.module @test_round_robin_assignment { %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if - // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) + // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) { %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> } else { %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> } xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> @@ -238,8 +222,8 @@ gpu.module @test_round_robin_assignment { gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) { %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> - //CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32> - //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32> + // CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32> + // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32> %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>, target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index 86a021b66949c..84ce80f477a55 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -14,13 +14,11 @@ gpu.module @test_distribution { // CHECK-LABEL: load_nd_tdesc_with_offset gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32> // 
CHECK-NOT: xegpu.load_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<256x128xf32> gpu.return @@ -28,8 +26,7 @@ gpu.module @test_distribution { // CHECK-LABEL: store_nd_with_offset gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.store_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -42,10 +39,8 @@ gpu.module @test_distribution { } // CHECK-LABEL: prefetch_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.prefetch_nd %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -57,15 +52,11 @@ gpu.module @test_distribution { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [4, 8], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> + // 
CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -102,27 +93,42 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}}> : vector<2x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[MAP4:.*]] = affine.apply #map4()[%[[SGID]]] - // CHECK-DAG: %[[MAP5:.*]] = affine.apply #map5()[%[[SGID]]] - // CHECK-DAG: %[[MUL:.*]] = index.mul %[[MAP4]], %[[C2:.*]] - // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[MUL]], %[[C32:.*]] - // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[MAP5]], %[[C1:.*]] + // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2:.*]] + // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C32:.*]] + // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C1:.*]] // CHECK-DAG: %[[ADD16:.*]] = arith.addi %[[MUL]], %[[C16:.*]] : index - // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[ADD16]], %[[C32:.*]] - // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[MAP5]], %[[C1:.*]] - // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU1]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU5:.*]] = index.remu %[[ADD16]], %[[C32:.*]] + // CHECK-DAG: %[[REMU6:.*]] = index.remu %[[REMU1]], %[[C1:.*]] + // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[STRIDE1]] : index - // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU2]], %[[C0:.*]] : index + // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index // CHECK-DAG: %[[ADDSTRIDES1:.*]] = arith.addi %[[ADDSTRIDES]], %[[STRIDE2]] : index // CHECK-DAG: %[[BCAST1:.*]] = vector.broadcast %[[ADDSTRIDES1]] : index to vector<2x1xindex> // CHECK-DAG: %[[RESULT1:.*]] = arith.addi %[[BASECST]], %[[BCAST1]] : vector<2x1xindex> - // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index + // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU5]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES2:.*]] = arith.addi %[[C0:.*]], %[[STRIDE3]] : index - // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index + // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU6]], %[[C0:.*]] : index // CHECK-DAG: %[[ADDSTRIDES3:.*]] = arith.addi %[[ADDSTRIDES2]], %[[STRIDE4]] : index // CHECK-DAG: %[[BCAST2:.*]] = vector.broadcast %[[ADDSTRIDES3]] : index to vector<2x1xindex> // CHECK-DAG: %[[RESULT2:.*]] = arith.addi %[[BASECST]], %[[BCAST2]] : vector<2x1xindex> %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> gpu.return } + + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x128xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data 
= [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + -> vector<256x128xf32> + // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<32x16xf32> to vector<16x32xf32> + // CHECK-NOT: vector.transpose + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x128xf32> to vector<128x256xf32> + gpu.return + } } + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 52acde4dffc2e..4fbb566cfbe73 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -1,8 +1,5 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> -//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> -//CHECK: #map2 = affine_map<()[s0] -> (s0 floordiv 8)> gpu.module @test_distribution { // CHECK-LABEL: create_nd_tdesc_no_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> @@ -26,13 +23,23 @@ gpu.module @test_distribution { } // CHECK-LABEL: load_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: %[[LOAD:.*]] = xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32> - %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + //CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + //CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + //CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index + //CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %[[C8]] + //CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index + //CHECK-DAG: %[[L_OFF_Y:.*]] = index.mul %[[SGIDY]], %[[C32]] + //CHECK-DAG: %[[L_OFF_X:.*]] = index.mul %[[SGIDX]], %[[C32]] + //CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index + //CHECK-DAG: %[[OFF_Y:.*]] = index.remu %[[L_OFF_Y]], %[[C256]] + //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + //CHECK-DAG: %[[OFF_X:.*]] = index.remu %[[L_OFF_X]], %[[C128]] + //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -43,9 +50,6 @@ gpu.module 
@test_distribution { // CHECK-LABEL: store_nd_with_offsets // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @store_nd_with_offsets(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -60,9 +64,6 @@ gpu.module @test_distribution { // CHECK-LABEL: prefetch_nd_tdesc_with_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %cst0 = arith.constant 0 : index %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> @@ -285,7 +286,7 @@ gpu.module @test_distribution { // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16> // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1> - // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> + // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}> // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>, // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> @@ -319,17 +320,15 @@ gpu.module @test_distribution { gpu.func @distribute_load_matrix(%arg0: memref<32768xi8, 3>) { //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[c4_0:%.+]] = arith.constant 4 : index - //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] - //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] + //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] - //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c0_1:%.+]] = arith.constant 0 : index + //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] //CHECK: [[c64:%.+]] = 
arith.constant 64 : index //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -346,17 +345,15 @@ gpu.module @test_distribution { //CHECK: [[cst:%.+]] = arith.constant dense<1.000000e+00> : vector<32x32xf32> //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[c4_0:%.+]] = arith.constant 4 : index - //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] - //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] + //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] - //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c0_2:%.+]] = arith.constant 0 : index + //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] //CHECK: [[c64:%.+]] = arith.constant 64 : index //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -411,14 +408,17 @@ gpu.module @test_distribution { // CHECK-LABEL: vector_step_op gpu.func @vector_step_op_slice_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]] - //CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index - //CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]] - //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c8]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgId]], [[c8]] + //CHECK: [[c4:%.+]] = arith.constant 4 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c4]] + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[LY:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> gpu.return @@ -426,14 +426,14 @@ gpu.module @test_distribution { gpu.func @vector_step_op_layout_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index - //CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index - //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]] - //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK-DAG: 
[[BASE:%.+]] = vector.step : vector<8xindex> - //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> + //CHECK: [[c16:%.+]] = arith.constant 16 : index + //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c16]] + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[LOCALY:%.+]] = index.mul [[sgidx]], [[c8]] + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<8xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex> %step = vector.step {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [8]>}: vector<128xindex> gpu.return @@ -464,14 +464,27 @@ gpu.module @test_distribution { gpu.return } + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x32xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32> + -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + -> vector<256x32xf32> + //CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<64x32xf32> to vector<32x64xf32> + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x32xf32> to vector<32x256xf32> + gpu.return + } + // CHECK-LABEL: non_splat_constant_2D gpu.func @non_splat_constant_2D() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: affine.apply #map4()[%[[SGID]]] - // CHECK-DAG: affine.apply #map5()[%[[SGID]]] - // CHECK-DAG: %[[IDY:.*]] = index.remu %{{.*}}, %[[C32:.*]] - // CHECK-DAG: %[[IDX:.*]] = index.remu %{{.*}}, %[[C1:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[IDY:.*]] = index.remu %[[SGIDY]], %{{.*}} + // CHECK-DAG: %[[IDX:.*]] = index.remu %[[SGIDX]], %{{.*}} // CHECK-DAG: %[[STRIDECOL:.*]] = arith.muli %[[IDY]], %[[C16:.*]] : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[STRIDECOL]] : index // CHECK-DAG: %[[STRIDEROW:.*]] = arith.muli %[[IDX]], %[[C0:.*]] : index @@ -484,20 +497,19 @@ gpu.module @test_distribution { // CHECK-LABEL: non_splat_constant_2D_non_unit_dim gpu.func @non_splat_constant_2D_non_unit_dim() { - // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}} : vector<2x2xindex> + // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{\[}}{{\[}}0, 16{{\]}}, {{\[}}8, 24{{\]}}{{\]}}> : vector<2x2xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[IDY:.*]] = affine.apply #map()[%[[SGID]]] - // CHECK-DAG: %[[IDX:.*]] = affine.apply #map1()[%[[SGID]]] - // CHECK-DAG: %[[MULY:.*]] = index.mul %[[IDY]], %[[C2:.*]] - // CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index - // CHECK-DAG: %[[MULX:.*]] = index.mul %[[IDX]], %[[C2:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu 
%[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[SGIDY]], %[[C2:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[SGIDX]], %{{.*}} // CHECK-DAG: %[[REMU_Y:.*]] = index.remu %[[MULY]], %[[C8:.*]] - // CHECK-DAG: %[[C8_2:.*]] = arith.constant 8 : index - // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %[[C8:.*]] - // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %[[C8:.*]] : index + // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %{{.*}} + // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %{{.*}} : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[MUL5]] : index // CHECK-DAG: %[[MUL6:.*]] = arith.muli %[[REMU_X]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index + // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex> // CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex> %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>} dense<[ @@ -517,13 +529,14 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %[[C32:.*]] - // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[REMU]], %{{.*}} + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex> // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex> %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [1]>} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex> - // CHECK: arith.constant dense<{{\[}}[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]{{\]}}> : vector<1x16xindex> + // CHECK: arith.constant dense<{{\[}}{{\[}}0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15{{\]}}{{\]}}> : vector<1x16xindex> %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index e83229e3a3995..5ce3d1d0fb5d6 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -1,47 +1,35 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> -//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> gpu.module @test_1_1_assignment { // CHECK-LABEL: create_nd_tdesc - // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: [[C32:%.+]] = 
arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] - //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]] - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]] - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[MODY]], %[[MODX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return } // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref - // CHECK-SAME: [[ARG_0:%.*]]: memref<3x256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<3x256x128xf32> gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: [[C32:%.+]] = arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] - //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_2:%.+]] = arith.constant 0 : index - //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]] - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]] - //CHECK: [[C0_3:%.+]] = arith.constant 0 : index - //CHECK: [[C0_4:%.+]] = arith.constant 0 : index - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][1, %[[MODY]], %[[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32> -> !xegpu.tensor_desc<256x128xf32, 
#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return @@ -81,25 +69,24 @@ gpu.module @test_1_1_assignment { xegpu.store_nd %load, %tdesc : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return -} + } -// CHECK-LABEL: update_nd -// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> -gpu.func @update_nd(%src: memref<256x128xf32>){ - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - %update = xegpu.update_nd_offset %tdesc, [0, 16] - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} + // CHECK-LABEL: update_nd + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + gpu.func @update_nd(%src: memref<256x128xf32>){ + // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> + // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] + // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + %update = xegpu.update_nd_offset %tdesc, [0, 16] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + gpu.return + } -// CHECK-LABEL: dpas -gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK-LABEL: dpas + gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>> %load_a = xegpu.load_nd %tdesc_a @@ -110,16 +97,15 @@ gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } - -// CHECK-LABEL: dpas_no_sg_data 
-gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-LABEL: dpas_no_sg_data + gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>> @@ -134,6 +120,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>> -> vector<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> @@ -196,9 +183,9 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { } gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) { - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[c1024:%.+]] = arith.constant 1024 : index + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + // CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c1024 = arith.constant 1024 : index @@ -211,15 +198,15 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> - // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] - // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> + // CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]] + // CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) -> // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>) - // CHECK: [[a:%.+]] = xegpu.load_nd [[arg4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> - // CHECK: [[b:%.+]] = xegpu.load_nd [[arg5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> - // CHECK: [[c:%.+]] = xegpu.dpas [[a]], [[b]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> - // CHECK: [[at:%.+]] = xegpu.update_nd_offset [[arg4]], [[[c0]], [[c128]]] : !xegpu.tensor_desc<16x128xf16> - // CHECK: [[bt:%.+]] = xegpu.update_nd_offset [[arg5]], [[[c128]], [[c0]]] : !xegpu.tensor_desc<128x16xf16> - // CHECK: scf.yield [[at]], [[bt]], [[c]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> + // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : 
!xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> + // CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> + // CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16> + // CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16> + // CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) { @@ -252,7 +239,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { // CHECK: scf.condition{{.*}} : vector<16xf32>, i32 scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { - // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: i32) + // CHECK: (%[[ARG2:.*]]: vector<16xf32>, %[[ARG3:.*]]: i32) ^bb0(%arg2: vector<256xf32>, %arg3: i32): xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> %4 = arith.addi %arg3, %c1_i32 : i32 @@ -344,9 +331,9 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %cond4 = arith.cmpi slt, %sg_id, %c31 : index %cond5 = arith.andi %cond3, %cond4 : i1 scf.if %cond5 { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C2:.*]] = arith.constant 2 : index - // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C2:.*]] = arith.constant 2 : index + // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] %tdesc = xegpu.create_nd_tdesc %src2[0, 0] : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>> %load = xegpu.load_nd %tdesc diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 60bd24a27868e..1e4cf8d4589cb 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -1308,7 +1308,7 @@ llvm.func @experimental_constrained_fpext(%s: f32, %v: vector<4xf32>) { // CHECK-DAG: declare float @llvm.cos.f32(float) // CHECK-DAG: declare <8 x float> @llvm.cos.v8f32(<8 x float>) #0 // CHECK-DAG: declare { float, float } @llvm.sincos.f32(float) -// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) #0 +// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) // CHECK-DAG: declare float @llvm.copysign.f32(float, float) // CHECK-DAG: declare float @llvm.rint.f32(float) // CHECK-DAG: declare double @llvm.rint.f64(double) diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir new file mode 100644 index 0000000000000..b5bb22350dcd7 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir @@ -0,0 +1,182 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// ----- + +// Test valid architectures work + +// Valid case on sm_100a +gpu.module @valid_f16x2_rs_sm_100a 
[#nvvm.target<chip = "sm_100a">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0x12345678 : i32) : i32 + %res = nvvm.convert.f32x2.to.f16x2 %f1, %f2, %rbits : vector<2xf16> + return + } +} + +// Valid case on sm_103a +gpu.module @valid_bf16x2_rs_sm_103a [#nvvm.target<chip = "sm_103a">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0 : i32) : i32 + %res = nvvm.convert.f32x2.to.bf16x2 %f1, %f2, %rbits : vector<2xbf16> + return + } +} + +// ----- + +// Test F32x2 -> F16x2 with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs +llvm.func @convert_f32x2_to_f16x2_rs(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_satfinite +llvm.func @convert_f32x2_to_f16x2_rs_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {sat = #nvvm.sat_mode<satfinite>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_relu +llvm.func @convert_f32x2_to_f16x2_rs_relu(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {relu = true} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_relu_satfinite +llvm.func @convert_f32x2_to_f16x2_rs_relu_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {relu = true, sat = #nvvm.sat_mode<satfinite>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// ----- + +// Test F32x2 -> BF16x2 with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs +llvm.func @convert_f32x2_to_bf16x2_rs(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_satfinite +llvm.func @convert_f32x2_to_bf16x2_rs_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_relu +llvm.func @convert_f32x2_to_bf16x2_rs_relu(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {relu = true} : 
vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_relu_satfinite +llvm.func @convert_f32x2_to_bf16x2_rs_relu_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {relu = true, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// ----- + +// Test F32x4 -> F8x4 (E4M3) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e4m3_rs +llvm.func @convert_f32x4_to_f8x4_e4m3_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e4m3_rs_relu +llvm.func @convert_f32x4_to_f8x4_e4m3_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F8x4 (E5M2) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e5m2_rs +llvm.func @convert_f32x4_to_f8x4_e5m2_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E5M2) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e5m2_rs_relu +llvm.func @convert_f32x4_to_f8x4_e5m2_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f8E5M2) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F6x4 (E2M3) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e2m3_rs +llvm.func @convert_f32x4_to_f6x4_e2m3_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f6E2M3FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e2m3_rs_relu +llvm.func @convert_f32x4_to_f6x4_e2m3_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f6E2M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F6x4 (E3M2) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e3m2_rs +llvm.func @convert_f32x4_to_f6x4_e3m2_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f6E3M2FN) 
+ llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e3m2_rs_relu +llvm.func @convert_f32x4_to_f6x4_e3m2_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f6E3M2FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F4x4 (E2M1) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f4x4_e2m1_rs +llvm.func @convert_f32x4_to_f4x4_e2m1_rs(%src : vector<4xf32>, %rbits : i32) -> i16 { + // CHECK: %{{.*}} = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits : vector<4xf32> -> i16 (f4E2M1FN) + llvm.return %res : i16 +} + +// CHECK-LABEL: @convert_f32x4_to_f4x4_e2m1_rs_relu +llvm.func @convert_f32x4_to_f4x4_e2m1_rs_relu(%src : vector<4xf32>, %rbits : i32) -> i16 { + // CHECK: %{{.*}} = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits {relu = true} : vector<4xf32> -> i16 (f4E2M1FN) + llvm.return %res : i16 +} + diff --git a/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir new file mode 100644 index 0000000000000..a8a743006fbf8 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir @@ -0,0 +1,49 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +llvm.func @redux_sync_i32_with_abs(%value: i32, %offset: i32) { + // expected-error@+1 {{abs attribute is supported only for f32 type}} + %res = nvvm.redux.sync add %value, %offset {abs = true}: i32 -> i32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_i32_with_nan(%value: i32, %offset: i32) { + // expected-error@+1 {{nan attribute is supported only for f32 type}} + %res = nvvm.redux.sync add %value, %offset {nan = true}: i32 -> i32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_f32_with_invalid_kind_add(%value: f32, %offset: i32) { + // expected-error@+1 {{'add' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}} + %res = nvvm.redux.sync add %value, %offset: f32 -> f32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_f32_with_invalid_kind_and(%value: f32, %offset: i32) { + // expected-error@+1 {{'and' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}} + %res = nvvm.redux.sync and %value, %offset: f32 -> f32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_i32_with_invalid_kind_fmin(%value: i32, %offset: i32) { + // expected-error@+1 {{'fmin' redux kind unsupported with 'i32' type. 
Only supported type is 'f32'.}} + %res = nvvm.redux.sync fmin %value, %offset: i32 -> i32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_non_matching_types(%value: i32, %offset: i32) { + // expected-error@+1 {{failed to verify that all of {res, val} have same type}} + %res = nvvm.redux.sync add %value, %offset: i32 -> f32 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir new file mode 100644 index 0000000000000..f2ccfe71a3f23 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir @@ -0,0 +1,22 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +func.func @nvvm_invalid_shfl_pred(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{"return_value_and_is_valid" attribute must be specified when the return type is a struct type}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 : f32 -> !llvm.struct<(f32, i1)> +} + +// ----- + +func.func @nvvm_invalid_shfl_invalid_return_type_1(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{expected return type to be of type 'f32' but got 'i32' instead}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 : f32 -> i32 +} + +// ----- + +func.func @nvvm_invalid_shfl_invalid_return_type_2(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{expected first element in the returned struct to be of type 'f32' but got 'i32' instead}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 {return_value_and_is_valid} : f32 -> !llvm.struct<(i32, i1)> +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir new file mode 100644 index 0000000000000..1b93f20c15b99 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir @@ -0,0 +1,9 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +llvm.func @nvvm_tcgen05_ld_32x32b_offset(%tmemAddr : !llvm.ptr<6>, %offset : i64) -> () { + // expected-error@+1 {{offset argument is only supported for shape 16x32bx2}} + %ldv2 = nvvm.tcgen05.ld %tmemAddr, %offset { pack, shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32> + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 9115de65ff0e8..3fc09f371a347 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -538,9 +538,9 @@ llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %{{.*}}) nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %{{.*}}) - nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> + nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3> // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %{{.*}}) - nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> + nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3> llvm.return } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 8a848221a50dd..3fbd9e0567948 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1040,6 +1040,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { 
llvm.return } +// CHECK-LABEL: rocdl.global.load.async.to.lds +llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8 + rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32 + rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64 + rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128 + rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + llvm.return +} + // CHECK-LABEL: rocdl.tensor.load.to.lds llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { diff --git a/mlir/test/Target/SPIRV/group-ops.mlir b/mlir/test/Target/SPIRV/group-ops.mlir index cf519cba961c5..6f19b3553dd37 100644 --- a/mlir/test/Target/SPIRV/group-ops.mlir +++ b/mlir/test/Target/SPIRV/group-ops.mlir @@ -1,11 +1,13 @@ -// RUN: mlir-translate -no-implicit-module -test-spirv-roundtrip -split-input-file %s | FileCheck %s +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip --split-input-file %s | FileCheck %s // RUN: %if spirv-tools %{ rm -rf %t %} // RUN: %if spirv-tools %{ mkdir %t %} // RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %} // RUN: %if spirv-tools %{ spirv-val %t %} -spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, Linkage, SubgroupBallotKHR, Groups, SubgroupBufferBlockIOINTEL, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_shader_ballot, SPV_INTEL_subgroups, SPV_KHR_uniform_group_instructions]> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.3, + [Shader, Linkage, SubgroupBallotKHR, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], + [SPV_KHR_storage_buffer_storage_class, SPV_KHR_shader_ballot, SPV_KHR_uniform_group_instructions]> { // CHECK-LABEL: @subgroup_ballot spirv.func @subgroup_ballot(%predicate: i1) -> vector<4xi32> "None" { // CHECK: %{{.*}} = spirv.KHR.SubgroupBallot %{{.*}}: vector<4xi32> @@ -24,30 +26,6 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, Linkage, Subgrou %0 = spirv.GroupBroadcast <Workgroup> %value, %localid : f32, vector<3xi32> spirv.ReturnValue %0: f32 } - // CHECK-LABEL: @subgroup_block_read_intel - spirv.func @subgroup_block_read_intel(%ptr : !spirv.ptr<i32, StorageBuffer>) -> i32 "None" { - // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> i32 - %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> i32 - spirv.ReturnValue %0: i32 - } - // CHECK-LABEL: @subgroup_block_read_intel_vector - spirv.func @subgroup_block_read_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>) -> vector<3xi32> "None" { - // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> - %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> - spirv.ReturnValue %0: vector<3xi32> - } - // CHECK-LABEL: @subgroup_block_write_intel - spirv.func @subgroup_block_write_intel(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: i32) -> () "None" { - // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, 
%{{.*}} : i32 - spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : i32 - spirv.Return - } - // CHECK-LABEL: @subgroup_block_write_intel_vector - spirv.func @subgroup_block_write_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: vector<3xi32>) -> () "None" { - // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : vector<3xi32> - spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : vector<3xi32> - spirv.Return - } // CHECK-LABEL: @group_iadd spirv.func @group_iadd(%value: i32) -> i32 "None" { // CHECK: spirv.GroupIAdd <Workgroup> <Reduce> %{{.*}} : i32 diff --git a/mlir/test/Target/SPIRV/subgroup-block-intel.mlir b/mlir/test/Target/SPIRV/subgroup-block-intel.mlir new file mode 100644 index 0000000000000..14060e632fffd --- /dev/null +++ b/mlir/test/Target/SPIRV/subgroup-block-intel.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s + +// RUN: %if spirv-tools %{ rm -rf %t %} +// RUN: %if spirv-tools %{ mkdir %t %} +// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --spirv-save-validation-files-with-prefix=%t/module %s %} +// RUN: %if spirv-tools %{ spirv-val %t %} + +spirv.module Physical64 GLSL450 requires #spirv.vce<v1.3, [Addresses, Shader, Linkage, SubgroupBufferBlockIOINTEL], + [SPV_KHR_storage_buffer_storage_class, SPV_INTEL_subgroups]> { + // CHECK-LABEL: @subgroup_block_read_intel + spirv.func @subgroup_block_read_intel(%ptr : !spirv.ptr<i32, StorageBuffer>) -> i32 "None" { + // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> i32 + %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> i32 + spirv.ReturnValue %0: i32 + } + // CHECK-LABEL: @subgroup_block_read_intel_vector + spirv.func @subgroup_block_read_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>) -> vector<3xi32> "None" { + // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> + %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> + spirv.ReturnValue %0: vector<3xi32> + } + // CHECK-LABEL: @subgroup_block_write_intel + spirv.func @subgroup_block_write_intel(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: i32) -> () "None" { + // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : i32 + spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : i32 + spirv.Return + } + // CHECK-LABEL: @subgroup_block_write_intel_vector + spirv.func @subgroup_block_write_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: vector<3xi32>) -> () "None" { + // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : vector<3xi32> + spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : vector<3xi32> + spirv.Return + } +} diff --git a/mlir/test/Transforms/test-legalizer-full.mlir b/mlir/test/Transforms/test-legalizer-full.mlir index 42cec68b9fbbb..8da9109a32762 100644 --- a/mlir/test/Transforms/test-legalizer-full.mlir +++ b/mlir/test/Transforms/test-legalizer-full.mlir @@ -72,3 +72,21 @@ builtin.module { } } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. + +// expected-remark@+1 {{applyFullConversion failed}} +builtin.module { +func.func @test_preorder_legalization() { + // expected-error@+1 {{failed to legalize operation 'test.post_order_legalization'}} + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // Not-explicitly-legal ops are not allowed to survive. 
+ "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer-no-materializations.mlir b/mlir/test/Transforms/test-legalizer-no-materializations.mlir new file mode 100644 index 0000000000000..82dd7422b22b2 --- /dev/null +++ b/mlir/test/Transforms/test-legalizer-no-materializations.mlir @@ -0,0 +1,67 @@ +// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND + +// CHECK-LABEL: func @dropped_input_in_use +// CHECK-KIND-LABEL: func @dropped_input_in_use +func.func @dropped_input_in_use(%arg: i16, %arg2: i64) { + // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 + // CHECK-NEXT: "work"(%[[cast]]) : (i16) + // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} + // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) + // expected-remark@+1 {{op 'work' is not legalizable}} + "work"(%arg) : (i16) -> () +} + +// ----- + +// CHECK-KIND-LABEL: func @test_lookup_without_converter +// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 +// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} +// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () +// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () +func.func @test_lookup_without_converter() { + %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) + "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () + // Make sure that the second "replace_with_valid_consumer" lowering does not + // lookup the materialization that was created for the above op. 
+ "test.replace_with_valid_consumer"(%0) : (i64) -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} + +// ----- + +// CHECK-LABEL: func @remap_moved_region_args +func.func @remap_moved_region_args() { + // CHECK-NEXT: return + // CHECK-NEXT: ^bb1(%[[arg0:.*]]: i64, %[[arg1:.*]]: i16, %[[arg2:.*]]: i64, %[[arg3:.*]]: f32): + // CHECK-NEXT: %[[cast1:.*]]:2 = builtin.unrealized_conversion_cast %[[arg3]] : f32 to f16, f16 + // CHECK-NEXT: %[[cast2:.*]] = builtin.unrealized_conversion_cast %[[arg2]] : i64 to f64 + // CHECK-NEXT: %[[cast3:.*]] = builtin.unrealized_conversion_cast %[[arg0]] : i64 to f64 + // CHECK-NEXT: %[[cast4:.*]] = "test.cast"(%[[cast1]]#0, %[[cast1]]#1) : (f16, f16) -> f32 + // CHECK-NEXT: "test.valid"(%[[cast3]], %[[cast2]], %[[cast4]]) : (f64, f64, f32) + "test.region"() ({ + ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): + "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () + }) : () -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} + +// ----- + +// CHECK-LABEL: func @remap_cloned_region_args +func.func @remap_cloned_region_args() { + // CHECK-NEXT: return + // CHECK-NEXT: ^bb1(%[[arg0:.*]]: i64, %[[arg1:.*]]: i16, %[[arg2:.*]]: i64, %[[arg3:.*]]: f32): + // CHECK-NEXT: %[[cast1:.*]]:2 = builtin.unrealized_conversion_cast %[[arg3]] : f32 to f16, f16 + // CHECK-NEXT: %[[cast2:.*]] = builtin.unrealized_conversion_cast %[[arg2]] : i64 to f64 + // CHECK-NEXT: %[[cast3:.*]] = builtin.unrealized_conversion_cast %[[arg0]] : i64 to f64 + // CHECK-NEXT: %[[cast4:.*]] = "test.cast"(%[[cast1]]#0, %[[cast1]]#1) : (f16, f16) -> f32 + // CHECK-NEXT: "test.valid"(%[[cast3]], %[[cast2]], %[[cast4]]) : (f64, f64, f32) + "test.region"() ({ + ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): + "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () + }) {legalizer.should_clone} : () -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} diff --git a/mlir/test/Transforms/test-legalizer-rollback.mlir b/mlir/test/Transforms/test-legalizer-rollback.mlir index 71e11782e14b0..4bcca6b7e5228 100644 --- a/mlir/test/Transforms/test-legalizer-rollback.mlir +++ b/mlir/test/Transforms/test-legalizer-rollback.mlir @@ -163,3 +163,22 @@ func.func @create_unregistered_op_in_pattern() -> i32 { "test.return"(%0) : (i32) -> () } } + +// ----- + +// CHECK-LABEL: func @test_failed_preorder_legalization +// CHECK: "test.post_order_legalization"() ({ +// CHECK: %[[r:.*]] = "test.illegal_op_g"() : () -> i32 +// CHECK: "test.return"(%[[r]]) : (i32) -> () +// CHECK: }) : () -> () +// expected-remark @+1 {{applyPartialConversion failed}} +module { +func.func @test_failed_preorder_legalization() { + // expected-error @+1 {{failed to legalize operation 'test.post_order_legalization' that was explicitly marked illegal}} + "test.post_order_legalization"() ({ + %0 = "test.illegal_op_g"() : () -> (i32) + "test.return"(%0) : (i32) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 94c5bb4e93b06..88a71cc26ab0c 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -1,7 +1,6 @@ // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics %s | FileCheck %s // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics -profile-actions-to=- %s | FileCheck %s 
--check-prefix=CHECK-PROFILER // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0" -verify-diagnostics %s | FileCheck %s -// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND // CHECK-PROFILER: "name": "pass-execution", "cat": "PERF", "ph": "B" // CHECK-PROFILER: "name": "apply-conversion", "cat": "PERF", "ph": "B" @@ -146,36 +145,6 @@ func.func @no_remap_nested() { // ----- -// CHECK-LABEL: func @remap_moved_region_args -func.func @remap_moved_region_args() { - // CHECK-NEXT: return - // CHECK-NEXT: ^bb1(%{{.*}}: f64, %{{.*}}: f64, %{{.*}}: f16, %{{.*}}: f16): - // CHECK-NEXT: "test.cast"{{.*}} : (f16, f16) -> f32 - // CHECK-NEXT: "test.valid"{{.*}} : (f64, f64, f32) - "test.region"() ({ - ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): - "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () - }) : () -> () - // expected-remark@+1 {{op 'func.return' is not legalizable}} - return -} - -// ----- - -// CHECK-LABEL: func @remap_cloned_region_args -func.func @remap_cloned_region_args() { - // CHECK-NEXT: return - // CHECK-NEXT: ^bb1(%{{.*}}: f64, %{{.*}}: f64, %{{.*}}: f16, %{{.*}}: f16): - // CHECK-NEXT: "test.cast"{{.*}} : (f16, f16) -> f32 - // CHECK-NEXT: "test.valid"{{.*}} : (f64, f64, f32) - "test.region"() ({ - ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): - "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () - }) {legalizer.should_clone} : () -> () - // expected-remark@+1 {{op 'func.return' is not legalizable}} - return -} - // CHECK-LABEL: func @remap_drop_region func.func @remap_drop_region() { // CHECK-NEXT: return @@ -191,12 +160,9 @@ func.func @remap_drop_region() { // ----- // CHECK-LABEL: func @dropped_input_in_use -// CHECK-KIND-LABEL: func @dropped_input_in_use func.func @dropped_input_in_use(%arg: i16, %arg2: i64) { // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 // CHECK-NEXT: "work"(%[[cast]]) : (i16) - // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} - // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) // expected-remark@+1 {{op 'work' is not legalizable}} "work"(%arg) : (i16) -> () } @@ -452,11 +418,6 @@ func.func @test_multiple_1_to_n_replacement() { // CHECK: %[[cast:.*]] = "test.cast"(%[[producer]]) : (i16) -> f64 // CHECK: "test.valid_consumer"(%[[cast]]) : (f64) -> () // CHECK: "test.valid_consumer"(%[[producer]]) : (i16) -> () -// CHECK-KIND-LABEL: func @test_lookup_without_converter -// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 -// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} -// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () -// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () func.func @test_lookup_without_converter() { %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () @@ -487,3 +448,35 @@ func.func @test_working_1to1_pattern(%arg0: f16) { "test.type_consumer"(%arg0) : (f16) -> () "test.return"() : () -> () } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. 
+ + // CHECK: notifyBlockInserted into test.post_order_legalization: was unlinked + // CHECK: notifyOperationInserted: test.invalid + // CHECK: notifyBlockErased + // CHECK: notifyOperationInserted: test.valid, was unlinked + // CHECK: notifyOperationReplaced: test.invalid + // CHECK: notifyOperationErased: test.invalid + // CHECK: notifyOperationModified: test.post_order_legalization + + // CHECK-LABEL: func @test_preorder_legalization + // CHECK: "test.post_order_legalization"() ({ + // CHECK: ^{{.*}}(%[[arg0:.*]]: f64): + // Note: The survival of a not-explicitly-invalid operation does *not* cause + // a conversion failure when applying a partial conversion. + // CHECK: %[[cast:.*]] = "test.cast"(%[[arg0]]) : (f64) -> i64 + // CHECK: "test.remaining_consumer"(%[[cast]]) : (i64) -> () + // CHECK: "test.valid"(%[[arg0]]) : (f64) -> () + // CHECK: }) {is_legal} : () -> () +func.func @test_preorder_legalization() { + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // expected-remark @+1 {{'test.remaining_consumer' is not legalizable}} + "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + // expected-remark @+1 {{'func.return' is not legalizable}} + return +} diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index fd2b943ff1296..9b64bc691588d 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1418,6 +1418,22 @@ class TestTypeConsumerOpPattern } }; +class TestPostOrderLegalization : public ConversionPattern { +public: + TestPostOrderLegalization(MLIRContext *ctx, const TypeConverter &converter) + : ConversionPattern(converter, "test.post_order_legalization", 1, ctx) {} + LogicalResult + matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands, + ConversionPatternRewriter &rewriter) const final { + for (Region &r : op->getRegions()) + if (failed(rewriter.legalize(&r))) + return failure(); + rewriter.modifyOpInPlace( + op, [&]() { op->setAttr("is_legal", rewriter.getUnitAttr()); }); + return success(); + } +}; + /// Test unambiguous overload resolution of replaceOpWithMultiple. This /// function is just to trigger compiler errors. It is never executed.
[[maybe_unused]] void testReplaceOpWithMultipleOverloads( @@ -1532,7 +1548,8 @@ struct TestLegalizePatternDriver patterns.add<TestDropOpSignatureConversion, TestDropAndReplaceInvalidOp, TestPassthroughInvalidOp, TestMultiple1ToNReplacement, TestValueReplace, TestReplaceWithValidConsumer, - TestTypeConsumerOpPattern>(&getContext(), converter); + TestTypeConsumerOpPattern, TestPostOrderLegalization>( + &getContext(), converter); patterns.add<TestConvertBlockArgs>(converter, &getContext()); mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns, converter); @@ -1553,14 +1570,16 @@ struct TestLegalizePatternDriver [](Type type) { return type.isF32(); }); }); target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) { - return converter.isSignatureLegal(op.getFunctionType()) && - converter.isLegal(&op.getBody()); + return converter.isSignatureLegal(op.getFunctionType()); }); target.addDynamicallyLegalOp<func::CallOp>( [&](func::CallOp op) { return converter.isLegal(op); }); target.addDynamicallyLegalOp( OperationName("test.value_replace", &getContext()), [](Operation *op) { return op->hasAttr("is_legal"); }); + target.addDynamicallyLegalOp( + OperationName("test.post_order_legalization", &getContext()), + [](Operation *op) { return op->hasAttr("is_legal"); }); // TestCreateUnregisteredOp creates `arith.constant` operation, // which was not added to target intentionally to test @@ -2156,8 +2175,7 @@ struct TestTypeConversionDriver recursiveType.getName() == "outer_converted_type"); }); target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) { - return converter.isSignatureLegal(op.getFunctionType()) && - converter.isLegal(&op.getBody()); + return converter.isSignatureLegal(op.getFunctionType()); }); target.addDynamicallyLegalOp<TestCastOp>([&](TestCastOp op) { // Allow casts from F64 to F32. 
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td index ea20597231d58..9859bd06cb526 100644 --- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td +++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td @@ -470,4 +470,11 @@ def TestMemrefType : Test_Type<"TestMemref", }]; } +// Test implementation of an interface with methods specifying a +// method body +def TestBaseBody : Test_Type<"TestBaseBody", + [DeclareTypeInterfaceMethods<TestBaseTypeInterfacePrintTypeA>]> { + let mnemonic = "test_base_body"; +} + #endif // TEST_TYPEDEFS diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 76d461108b296..93d51441f5b81 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -200,7 +200,8 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> { Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape); + auto maybeOffsets = + sliceAttr.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(maybeOffsets)) return failure(); diff --git a/mlir/test/mlir-tblgen/cpp-class-comments.td b/mlir/test/mlir-tblgen/cpp-class-comments.td index 9dcf975e45286..0d3445d6647af 100644 --- a/mlir/test/mlir-tblgen/cpp-class-comments.td +++ b/mlir/test/mlir-tblgen/cpp-class-comments.td @@ -36,6 +36,7 @@ def A_SomeOp1 : Op<A_Dialect, "some_op1", []>{ let cppNamespace = "OP1"; // OP: namespace OP1 +// OP-EMPTY: // OP-NEXT: /// Some Op1 summary line1 // OP-NEXT: /// summary line2 // OP-NEXT: /// Some Op1 description @@ -97,6 +98,7 @@ def EncodingTrait : AttrInterface<"EncodingTrait"> { let methods = [ ]; // ATTR-INTERFACE: namespace mlir::a::traits { +// ATTR-INTERFACE-EMPTY: // ATTR-INTERFACE-NEXT: /// Common trait for all layouts. 
// ATTR-INTERFACE-NEXT: class EncodingTrait; } @@ -104,6 +106,7 @@ def EncodingTrait : AttrInterface<"EncodingTrait"> { def SimpleEncodingTrait : AttrInterface<"SimpleEncodingTrait"> { let cppNamespace = "a::traits"; // ATTR-INTERFACE: namespace a::traits { +// ATTR-INTERFACE-EMPTY: // ATTR-INTERFACE-NEXT: class SimpleEncodingTrait; } @@ -114,6 +117,7 @@ def SimpleOpInterface : OpInterface<"SimpleOpInterface"> { Simple Op Interface description }]; // OP-INTERFACE: namespace a::traits { +// OP-INTERFACE-EMPTY: // OP-INTERFACE-NEXT: /// Simple Op Interface description // OP-INTERFACE-NEXT: class SimpleOpInterface; } diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py index d6b70dc9d1978..e58b7646316fc 100644 --- a/mlir/test/python/dialects/transform_structured_ext.py +++ b/mlir/test/python/dialects/transform_structured_ext.py @@ -627,12 +627,16 @@ def testVectorizeChildrenAndApplyPatternsAllAttrs(target): disable_transfer_permutation_map_lowering_patterns=True, vectorize_nd_extract=True, vectorize_padding=True, + flatten_1d_depthwise_conv=True, + fold_type_extensions_into_contract=True, ) # CHECK-LABEL: TEST: testVectorizeChildrenAndApplyPatternsAllAttrs # CHECK: transform.sequence # CHECK: = transform.structured.vectorize # CHECK-SAME: disable_multi_reduction_to_contract_patterns # CHECK-SAME: disable_transfer_permutation_map_lowering_patterns + # CHECK-SAME: flatten_1d_depthwise_conv + # CHECK-SAME: fold_type_extensions_into_contract # CHECK-SAME: vectorize_nd_extract # CHECK-SAME: vectorize_padding @@ -646,12 +650,16 @@ def testVectorizeChildrenAndApplyPatternsNoAttrs(target): disable_transfer_permutation_map_lowering_patterns=False, vectorize_nd_extract=False, vectorize_padding=False, + flatten_1d_depthwise_conv=False, + fold_type_extensions_into_contract=False, ) # CHECK-LABEL: TEST: testVectorizeChildrenAndApplyPatternsNoAttrs # CHECK: transform.sequence # CHECK: = transform.structured.vectorize # CHECK-NOT: disable_multi_reduction_to_contract_patterns # CHECK-NOT: disable_transfer_permutation_map_lowering_patterns + # CHECK-NOT: flatten_1d_depthwise_conv + # CHECK-NOT: fold_type_extensions_into_contract # CHECK-NOT: vectorize_nd_extract # CHECK-NOT: vectorize_padding diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 8ec2e03095423..2a513c3b8cc9b 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -637,8 +637,10 @@ void DefGen::emitTraitMethods(const InterfaceTrait &trait) { for (auto &method : iface.getMethods()) { // Don't declare if the method has a body. Or if the method has a default // implementation and the def didn't request that it always be declared. - if (method.getBody() || (method.getDefaultImplementation() && - !alwaysDeclared.count(method.getName()))) { + if (method.getBody()) + continue; + if (method.getDefaultImplementation() && + !alwaysDeclared.count(method.getName())) { genTraitMethodUsingDecl(trait, method); continue; } diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py index 3712a6b9c963d..22774468bb403 100755 --- a/mlir/utils/generate-test-checks.py +++ b/mlir/utils/generate-test-checks.py @@ -230,14 +230,11 @@ def process_line(line_chunks, variable_namer, use_ssa_name=False, strict_name_re # Process the source file lines. The source file doesn't have to be .mlir. 
-def process_source_lines(source_lines, note, args): +def process_source_lines(source_lines, args): source_split_re = re.compile(args.source_delim_regex) source_segments = [[]] for line in source_lines: - # Remove previous note. - if line in note: - continue # Remove previous CHECK lines. if line.find(args.check_prefix) != -1: continue @@ -359,9 +356,10 @@ def main(): source_segments = None if args.source: - source_segments = process_source_lines( - [l.rstrip() for l in open(args.source, "r")], autogenerated_note, args - ) + with open(args.source, "r") as f: + raw_source = f.read().replace(autogenerated_note, "") + raw_source_lines = [l.rstrip() for l in raw_source.splitlines()] + source_segments = process_source_lines(raw_source_lines, args) if args.inplace: assert args.output is None diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index 5b54c79d83f9d..e9c154818c4a1 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -29,6 +29,7 @@ def ol_device_info_t : Enum { TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">, TaggedEtor<"NAME", "char[]", "Device name">, TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">, + TaggedEtor<"UID", "char[]", "Device UID">, TaggedEtor<"VENDOR", "char[]", "Device vendor">, TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">, TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">, diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 6d22faeb0e57e..84bc414396811 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -147,8 +147,8 @@ llvm::Error ol_platform_impl_t::init() { if (llvm::Error Err = Plugin->initDevice(Id)) return Err; - auto Device = &Plugin->getDevice(Id); - auto Info = Device->obtainInfoImpl(); + GenericDeviceTy *Device = &Plugin->getDevice(Id); + llvm::Expected<InfoTreeNode> Info = Device->obtainInfo(); if (llvm::Error Err = Info.takeError()) return Err; Devices.emplace_back(std::make_unique<ol_device_impl_t>(Id, Device, *this, @@ -467,6 +467,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, switch (PropName) { case OL_DEVICE_INFO_NAME: case OL_DEVICE_INFO_PRODUCT_NAME: + case OL_DEVICE_INFO_UID: case OL_DEVICE_INFO_VENDOR: case OL_DEVICE_INFO_DRIVER_VERSION: { // String values @@ -544,6 +545,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.writeString("Virtual Host Device"); case OL_DEVICE_INFO_PRODUCT_NAME: return Info.writeString("Virtual Host Device"); + case OL_DEVICE_INFO_UID: + return Info.writeString(GenericPluginTy::getHostDeviceUid()); case OL_DEVICE_INFO_VENDOR: return Info.writeString("Liboffload"); case OL_DEVICE_INFO_DRIVER_VERSION: diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 29cfe78082dbb..ddfa65c76cf2d 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -72,6 +72,7 @@ typedef enum hsa_amd_agent_info_s { HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010, + HSA_AMD_AGENT_INFO_UUID = 0xA011, HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016, } hsa_amd_agent_info_t; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 
0b03ef534d273..928c6cd7569e3 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2083,6 +2083,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; ComputeUnitKind = GPUName; + // From the ROCm HSA documentation: + // Query the UUID of the agent. The value is an Ascii string with a maximum + // of 21 chars including NUL. The string value consists of two parts: header + // and body. The header identifies the device type (GPU, CPU, DSP) while the + // body encodes the UUID as a 16 digit hex string. + // + // Agents that do not support UUID will return the string "GPU-XX" or + // "CPU-XX" or "DSP-XX" depending on their device type. + char UUID[24] = {0}; + if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_UUID, UUID)) + return Err; + if (!StringRef(UUID).ends_with("-XX")) + setDeviceUidFromVendorUid(UUID); + // Get the wavefront size. uint32_t WavefrontSize = 0; if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index f9bff9abd903c..f9dcdea7213fd 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -791,6 +791,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// this id is not unique between different plugins; they may overlap. int32_t getDeviceId() const { return DeviceId; } + /// Get the unique identifier of the device. + const char *getDeviceUid() const { return DeviceUid.c_str(); } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -989,9 +992,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error syncEvent(void *EventPtr); virtual Error syncEventImpl(void *EventPtr) = 0; + /// Obtain information about the device. + Expected<InfoTreeNode> obtainInfo(); + virtual Expected<InfoTreeNode> obtainInfoImpl() = 0; + /// Print information about the device. Error printInfo(); - virtual Expected<InfoTreeNode> obtainInfoImpl() = 0; /// Return true if the device has work that is either queued or currently /// running @@ -1204,6 +1210,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// global device id and is not the device id visible to the OpenMP user. const int32_t DeviceId; + /// The unique identifier of the device. + /// By default, the unique identifier of the device is set to the device id, + /// combined with the plugin name, since the offload device id may overlap + /// between different plugins. + std::string DeviceUid; + /// Construct the device UID from the vendor (U)UID. + void setDeviceUidFromVendorUid(StringRef VendorUid); + /// The default grid values used for this device. llvm::omp::GV GridValues; @@ -1290,6 +1304,9 @@ struct GenericPluginTy { return UserDeviceIds.at(DeviceId); } + /// Get the UID for the host device. + static constexpr const char *getHostDeviceUid() { return "HOST"; } + /// Get the ELF code to recognize the binary image of this plugin.
virtual uint16_t getMagicElfBits() const = 0; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 36d643b65922d..d7e5a21600abf 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -715,6 +715,10 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), PinnedAllocs(*this), RPCServer(nullptr) { + // Conservative fall-back to the plugin's device uid for the case that no real + // vendor (u)uid will become available later. + setDeviceUidFromVendorUid(std::to_string(static_cast<uint64_t>(DeviceId))); + #ifdef OMPT_SUPPORT OmptInitialized.store(false); // Bind the callbacks to this device's member functions @@ -1524,15 +1528,22 @@ Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData, return Err; } +Expected<InfoTreeNode> GenericDeviceTy::obtainInfo() { + auto InfoOrErr = obtainInfoImpl(); + if (InfoOrErr) + InfoOrErr->add("UID", getDeviceUid(), "", DeviceInfo::UID); + return InfoOrErr; +} + Error GenericDeviceTy::printInfo() { - auto Info = obtainInfoImpl(); + auto InfoOrErr = obtainInfo(); // Get the vendor-specific info entries describing the device properties. - if (auto Err = Info.takeError()) + if (auto Err = InfoOrErr.takeError()) return Err; // Print all info entries. - Info->print(); + InfoOrErr->print(); return Plugin::success(); } @@ -1603,6 +1614,10 @@ Expected<bool> GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) { return isAccessiblePtrImpl(Ptr, Size); } +void GenericDeviceTy::setDeviceUidFromVendorUid(StringRef VendorUid) { + DeviceUid = std::string(Plugin.getName()) + "-" + std::string(VendorUid); +} + Error GenericPluginTy::init() { if (Initialized) return Plugin::success(); diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index f5b2d074a47e7..e7a1ca38b3c13 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -35,6 +35,7 @@ DLWRAP(cuFuncSetAttribute, 3) // Device info DLWRAP(cuDeviceGetName, 3) +DLWRAP(cuDeviceGetUuid, 2) DLWRAP(cuDeviceTotalMem, 2) DLWRAP(cuDriverGetVersion, 1) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index dec4e33508c62..a470d6df1079d 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -33,6 +33,9 @@ typedef struct CUfunc_st *CUfunction; typedef void (*CUhostFn)(void *userData); typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; +typedef struct CUuuid_st { + char bytes[16]; +} CUuuid; #define CU_DEVICE_INVALID ((CUdevice)(-2)) @@ -301,6 +304,7 @@ CUresult cuFuncSetAttribute(CUfunction, CUfunction_attribute, int); // Device info CUresult cuDeviceGetName(char *, int, CUdevice); +CUresult cuDeviceGetUuid(CUuuid *, CUdevice); CUresult cuDeviceTotalMem(size_t *, CUdevice); CUresult cuDriverGetVersion(int *); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index db94f7f2dd995..a9adcc397fb7b 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -25,6 +25,7 @@ #include "PluginInterface.h" #include "Utils/ELF.h" +#include 
"llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" @@ -293,6 +294,12 @@ struct CUDADeviceTy : public GenericDeviceTy { if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s")) return Err; + CUuuid UUID = {0}; + Res = cuDeviceGetUuid(&UUID, Device); + if (auto Err = Plugin::check(Res, "error in cuDeviceGetUuid: %s")) + return Err; + setDeviceUidFromVendorUid(toHex(UUID.bytes, true)); + // Query the current flags of the primary context and set its flags if // it is inactive. unsigned int FormerPrimaryCtxFlags = 0; diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 9b58d67f017ca..42ffb97d6d77c 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -176,6 +176,7 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_NAME, "Name")); OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_PRODUCT_NAME, "Product Name")); + OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_UID, "UID")); OFFLOAD_ERR( printDeviceValue<ol_device_type_t>(S, D, OL_DEVICE_INFO_TYPE, "Type")); OFFLOAD_ERR(printDeviceValue<const char *>( diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 8cb0b8065c33e..30eafee026316 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -98,6 +98,16 @@ TEST_P(olGetDeviceInfoTest, SuccessProductName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, SuccessUID) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_UID, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> UID; + UID.resize(Size); + ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_UID, Size, UID.data())); + ASSERT_EQ(std::strlen(UID.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, HostProductName) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); @@ -109,6 +119,16 @@ TEST_P(olGetDeviceInfoTest, HostProductName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, HostUID) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_UID, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> UID; + UID.resize(Size); + ASSERT_SUCCESS(olGetDeviceInfo(Host, OL_DEVICE_INFO_UID, Size, UID.data())); + ASSERT_EQ(std::strlen(UID.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, SuccessVendor) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index c4a3c2d5e3c75..79a18c1d133dc 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -32,6 +32,7 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t, OL_DEVICE_INFO_PLATFORM); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME); OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(UID, OL_DEVICE_INFO_UID); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR); 
OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION); OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t, diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 6ac047a833fe5..5dd7f4b33612d 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -254,23 +254,35 @@ set(LIBOMP_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) # Add symbolic links to libomp if(NOT WIN32) - add_custom_command(TARGET omp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} - libgomp${LIBOMP_LIBRARY_SUFFIX} - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} - libiomp5${LIBOMP_LIBRARY_SUFFIX} - WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} - ) - if(LIBOMP_ENABLE_SHARED) - if(APPLE) - set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX}) - else() - set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1) - endif() + if(AIX) + # On AIX, libomp.a is the name for both static and shared objects. + set(LIBOMP_AIX_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) add_custom_command(TARGET omp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME} + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${LIBOMP_LIB_NAME}${LIBOMP_AIX_SUFFIX} libgomp${LIBOMP_AIX_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${LIBOMP_LIB_NAME}${LIBOMP_AIX_SUFFIX} libiomp5${LIBOMP_AIX_SUFFIX} WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} ) + else() + add_custom_command(TARGET omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libiomp5${LIBOMP_LIBRARY_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libgomp${LIBOMP_LIBRARY_SUFFIX} + WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + ) + if(LIBOMP_ENABLE_SHARED) + if(APPLE) + set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX}) + else() + set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1) + endif() + add_custom_command(TARGET omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME} + WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + ) + endif() endif() endif() diff --git a/polly/lib/Support/GICHelper.cpp b/polly/lib/Support/GICHelper.cpp index 027e0194732f4..948bb6a9b9614 100644 --- a/polly/lib/Support/GICHelper.cpp +++ b/polly/lib/Support/GICHelper.cpp @@ -59,7 +59,7 @@ APInt polly::APIntFromVal(__isl_take isl_val *Val) { Data = (uint64_t *)malloc(NumChunks * ChunkSize); isl_val_get_abs_num_chunks(Val, ChunkSize, Data); int NumBits = CHAR_BIT * ChunkSize * NumChunks; - APInt A(NumBits, NumChunks, Data); + APInt A(NumBits, ArrayRef(Data, NumChunks)); // As isl provides only an interface to obtain data that describes the // absolute value of an isl_val, A at this point always contains a positive diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 070700a64a168..0888ebd7a9362 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -237,6 +237,7 @@ struct OptimizerAdditionalInfoTy { bool Postopts; bool Prevect; bool &DepsChanged; + IslMaxOperationsGuard &MaxOpGuard; }; class ScheduleTreeOptimizer final { @@ -381,6 +382,8 @@ class ScheduleTreeOptimizer final { isl::schedule_node ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, int VectorWidth) { + if (Node.is_null()) + return {}; assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); Node = Node.child(0).child(0); isl::union_map 
SchedRelUMap = Node.get_prefix_schedule_relation(); @@ -391,6 +394,8 @@ ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); Node = Node.parent().parent(); isl::union_set Options = IsolateOption.unite(AtomicOption); + if (Node.is_null()) + return {}; isl::schedule_node_band Result = Node.as<isl::schedule_node_band>().set_ast_build_options(Options); return Result; @@ -411,9 +416,13 @@ struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> { isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { + if (Node.is_null()) + return {}; assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); + if (Space.is_null()) + return {}; unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set)); assert(DimToVectorize < ScheduleDimensions); @@ -439,12 +448,15 @@ isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( // Sink the inner loop into the smallest possible statements to make them // represent a single vector instruction if possible. Node = isl::manage(isl_schedule_node_band_sink(Node.release())); + if (Node.is_null()) + return {}; // Add SIMD markers to those vector statements. InsertSimdMarkers SimdMarkerInserter; Node = SimdMarkerInserter.visit(Node); - PrevectOpts++; + if (!Node.is_null()) + PrevectOpts++; return Node.parent(); } @@ -535,6 +547,8 @@ ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { isl::schedule_node ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) { auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); + if (Space.is_null()) + return {}; int Dims = unsignedFromIslSize(Space.dim(isl::dim::set)); for (int i = Dims - 1; i >= 0; i--) @@ -572,9 +586,14 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg, Node = applyTileBandOpt(Node); if (OAI->Prevect) { + IslQuotaScope MaxScope = OAI->MaxOpGuard.enter(); + // FIXME: Prevectorization requirements are different from those checked by // isTileableBandNode. Node = applyPrevectBandOpt(Node); + + if (OAI->MaxOpGuard.hasQuotaExceeded() || Node.is_null()) + return (isl::schedule_node()).release(); } return Node.release(); @@ -771,6 +790,10 @@ static void runIslScheduleOptimizer( return; } + isl_ctx *Ctx = S.getIslCtx().get(); + IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut, + /*AutoEnter=*/false); + // Apply ISL's algorithm only if not overridden by the user. 
Note that // post-rescheduling optimizations (tiling, pattern-based, prevectorization) // rely on the coincidence/permutable annotations on schedule tree bands that @@ -853,8 +876,6 @@ static void runIslScheduleOptimizer( IslOuterCoincidence = 0; } - isl_ctx *Ctx = S.getIslCtx().get(); - isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); @@ -870,28 +891,20 @@ static void runIslScheduleOptimizer( SC = SC.set_coincidence(Validity); { - IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut); + IslQuotaScope MaxOpScope = MaxOpGuard.enter(); Schedule = SC.compute_schedule(); - - if (MaxOpGuard.hasQuotaExceeded()) - POLLY_DEBUG( - dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); } isl_options_set_on_error(Ctx, OnErrorStatus); - ScopsRescheduled++; + if (!Schedule.is_null()) + ScopsRescheduled++; POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); } walkScheduleTreeForStatistics(Schedule, 1); - // In cases the scheduler is not able to optimize the code, we just do not - // touch the schedule. - if (Schedule.is_null()) - return; - - if (GreedyFusion) { + if (GreedyFusion && !Schedule.is_null()) { isl::union_map Validity = D.getDependences( Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); Schedule = applyGreedyFusion(Schedule, Validity); @@ -905,14 +918,20 @@ static void runIslScheduleOptimizer( /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, /*Postopts=*/!HasUserTransformation && EnablePostopts, /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE, - DepsChanged}; - if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { + DepsChanged, + MaxOpGuard}; + if (!Schedule.is_null() && (OAI.PatternOpts || OAI.Postopts || OAI.Prevect)) { Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); Schedule = hoistExtensionNodes(Schedule); POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); walkScheduleTreeForStatistics(Schedule, 2); } + if (MaxOpGuard.hasQuotaExceeded()) { + POLLY_DEBUG(dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); + return; + } + // Skip profitability check if user transformation(s) have been applied. if (!HasUserTransformation && !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp index 3f3630027e6e3..c95c55858f038 100644 --- a/polly/lib/Transform/ScheduleTreeTransform.cpp +++ b/polly/lib/Transform/ScheduleTreeTransform.cpp @@ -972,6 +972,9 @@ BandAttr *polly::getBandAttr(isl::schedule_node MarkOrBand) { } isl::schedule polly::hoistExtensionNodes(isl::schedule Sched) { + if (Sched.is_null()) + return {}; + // If there is no extension node in the first place, return the original // schedule tree. 
if (!containsExtensionNode(Sched)) @@ -1126,6 +1129,8 @@ isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange, isl::union_set polly::getIsolateOptions(isl::set IsolateDomain, unsigned OutDimsNum) { + if (IsolateDomain.is_null()) + return {}; unsigned Dims = unsignedFromIslSize(IsolateDomain.tuple_dim()); assert(OutDimsNum <= Dims && "The isl::set IsolateDomain is used to describe the range of schedule " diff --git a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll new file mode 100644 index 0000000000000..0bc3c2cf642e8 --- /dev/null +++ b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll @@ -0,0 +1,37 @@ +; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine -passes=polly-opt-isl -polly-debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +define void @ham(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3, ptr %arg4, i32 %arg5, i32 %arg6) { +bb: + %getelementptr = getelementptr [7 x float], ptr null, i32 0, i32 %arg3 + br label %bb7 + +bb7: ; preds = %bb11, %bb + %phi = phi i32 [ 0, %bb ], [ %add16, %bb11 ] + br label %bb8 + +bb8: ; preds = %bb8, %bb7 + %phi9 = phi i32 [ 0, %bb7 ], [ %add, %bb8 ] + %getelementptr10 = getelementptr [7 x float], ptr null, i32 0, i32 %phi9 + store float 0.000000e+00, ptr %getelementptr10, align 4 + %add = add i32 %phi9, 1 + %icmp = icmp eq i32 %phi9, 0 + br i1 %icmp, label %bb8, label %bb11 + +bb11: ; preds = %bb8 + %load = load float, ptr %getelementptr, align 4 + store float %load, ptr %arg4, align 4 + %getelementptr12 = getelementptr [7 x float], ptr null, i32 0, i32 %arg5 + %load13 = load float, ptr %getelementptr12, align 4 + store float %load13, ptr %arg, align 4 + %getelementptr14 = getelementptr [7 x float], ptr null, i32 0, i32 %arg6 + %load15 = load float, ptr %getelementptr14, align 4 + store float %load15, ptr %arg1, align 4 + %add16 = add i32 %phi, 1 + %icmp17 = icmp ne i32 %phi, %arg2 + br i1 %icmp17, label %bb7, label %bb18 + +bb18: ; preds = %bb11 + ret void +} +; CHECK:Schedule optimizer calculation exceeds ISL quota diff --git a/runtimes/cmake/Modules/HandleLibC.cmake b/runtimes/cmake/Modules/HandleLibC.cmake index 01da5b260d3d4..f8869512f99d3 100644 --- a/runtimes/cmake/Modules/HandleLibC.cmake +++ b/runtimes/cmake/Modules/HandleLibC.cmake @@ -30,7 +30,9 @@ elseif (RUNTIMES_USE_LIBC STREQUAL "llvm-libc") check_cxx_compiler_flag(-nostdlibinc CXX_SUPPORTS_NOSTDLIBINC_FLAG) if(CXX_SUPPORTS_NOSTDLIBINC_FLAG) target_compile_options(runtimes-libc-headers INTERFACE "-nostdlibinc") - target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}") + if(LIBC_KERNEL_HEADERS) + target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}") + endif() endif() add_library(runtimes-libc-static INTERFACE) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index ae9ff2878e03b..3ded8bdf89514 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -80,6 +80,7 @@ cc_library( hdrs = glob(["LanguageRuntime/CPlusPlus/*.h"]), includes = [".."], deps = [ + "//clang:codegen", "//lldb:CoreHeaders", "//lldb:Headers", "//lldb:SymbolHeaders", diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index bcee6c4e243a4..ad013035251f5 100644 --- 
a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2551,6 +2551,7 @@ llvm_target_lib_list = [lib for lib in [ "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc": ["-gen-register-info"], "lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc": ["-gen-subtarget"], "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc": ["-gen-asm-matcher"], + "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc": ["-gen-sd-node-info"], }, }, {
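The ScheduleOptimizer changes above hoist the IslMaxOperationsGuard to function scope (constructed with AutoEnter=false) and enter it explicitly around both the schedule computation and the prevectorization post-optimizations, bailing out when the quota is exceeded. The snippet below is a minimal, illustrative sketch of that pattern only, not part of the patch; it assumes IslMaxOperationsGuard and IslQuotaScope live in Polly's GICHelper support header, and boundedIslWork is a hypothetical stand-in for the real work done in runIslScheduleOptimizer.

// Sketch, not part of the patch: bound expensive isl work with an
// operations quota and check it afterwards.
#include "polly/Support/GICHelper.h" // assumed home of IslMaxOperationsGuard
using namespace polly;

static void boundedIslWork(isl_ctx *Ctx, unsigned long ComputeOut) {
  // Construct the guard once; AutoEnter=false means no quota is installed yet.
  IslMaxOperationsGuard MaxOpGuard(Ctx, ComputeOut, /*AutoEnter=*/false);

  {
    // The operation budget is active only while this scope object is alive.
    IslQuotaScope Scope = MaxOpGuard.enter();
    // ... expensive isl computations; they start yielding null objects once
    // the budget is exhausted ...
  }

  if (MaxOpGuard.hasQuotaExceeded()) {
    // Give up and keep the original schedule, as the patch now does.
    return;
  }
}

The effect of AutoEnter=false is that the budget applies only inside the explicitly entered scopes, so code between the bounded regions (diagnostics, option handling) runs without a quota while still sharing a single guard for the final hasQuotaExceeded() check.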
Number Section Status Issue title Available in Clang?
1[dcl.fct.default] TC1 What if two using-declarations refer to the same function but the declarations introduce different default-arguments? No
2[temp.dep.res] drafting How can dependent names be used in member declarations that appear outside of the class template definition? Not resolved
3[temp.expl.spec] NAD The template compilation model rules render some explicit specialization declarations not visible during instantiation Clang 2.7
4[dcl.link] CD1 Does extern "C" affect the linkage of function names with internal linkage? Clang 2.8
5[dcl.init] CD1 CV-qualifiers and type conversions Clang 3.1
6[class.copy.elision] NAD Should the optimization that allows a class object to alias another object also allow the case of a parameter in an inline function to alias its argument? Yes
7[class.access.base] NAD Can a class with a private virtual base class be derived from? Clang 3.4
8[class.access] CD1 Access to template arguments used in a function return type and in the nested name specifier Duplicate of 45
9[class.access.base] CD1 Clarification of access to base class members Clang 2.8
10[class.access.nest] CD1 Can a nested class access its own class name as a qualified name if it is a private member of the enclosing class? Duplicate of 45
11[namespace.udecl] CD1 How do the keywords typename/template interact with using-declarations? Clang 2.7
12[basic.lookup.argdep] dup Default arguments on different declarations for the same function and the Koenig lookup Superseded by 239
13[dcl.link] NAD extern "C" for Parameters of Function Templates No
14[dcl.link] NAD extern "C" functions and declarations in different namespaces Clang 3.4
15[dcl.fct.default] dup Default arguments for parameters of function templates Clang 2.7
16[class.access.base] CD1 Access to members of indirect private base classes Clang 2.8
17[class.access.base] NAD Footnote 99 should discuss the naming class when describing members that can be accessed from friends Clang 2.7
18[dcl.fct] NAD f(TYPE) where TYPE is void should be allowed Superseded by 577
19[class.protected] NAD Clarify protected member access Clang 3.1
20[class.copy.ctor] TC1 Some clarifications needed for 12.8 para 15 Clang 2.8
21[temp.param] TC1 Can a default argument for a template parameter appear in a friend declaration? Clang 3.4
22[temp.dep.res] TC1 Template parameter with a default argument that refers to itself Superseded by 481
23[temp.func.order] NAD Some questions regarding partial ordering of function templates Clang 2.7
24[temp.expl.spec] TC1 Errors in examples in 14.7.3 N/A
25[except.spec] TC1 Exception specifications and pointers to members Clang 4
26[class.copy.ctor] NAD Copy constructors and default arguments Clang 2.7
27[over.built] NAD Overload ambiguities for builtin ?: prototypes Clang 2.7
28[basic.start.dynamic] CD1 'exit', 'signal' and static object destruction N/A (Library DR)
29[dcl.link] CD1 Linkage of locally declared functions Clang 3.4
30[temp.names] TC1 Valid uses of "::template" Superseded by 468 (C++11 onwards)
31[expr.new] NAD Looking up new/delete Clang 2.8
32[temp] TC1 Clarification of explicit instantiation of non-exported templates N/A
33[basic.lookup.argdep] TC1 Argument dependent lookup and overloaded functions Clang 9
34[temp.inst] NAD Argument dependent lookup and points of instantiation N/A
35[dcl.init] TC1 Definition of default-initialization Duplicate of 178
36[namespace.udecl] CD6 using-declarations in multiple-declaration contexts Clang 2.8
37[except.uncaught] NAD When is uncaught_exception() true? Superseded by 475
38[temp.names] TC1 Explicit template arguments and operator functions Clang 2.7
39[class.member.lookup] CD1 Conflicting ambiguity rules No
40[dcl.meaning] TC1 Syntax of declarator-id N/A
41[basic.lookup.unqual] TC1 Clarification of lookup of names after declarator-id Clang 2.7
42[basic.scope.class] NAD Redefining names from base classes Clang 2.7
43[basic.types] TC1 Copying base classes (PODs) using memcpy N/A
44[temp.expl.spec] CD1 Member specializations Superseded by 727
45[class.access.nest] CD1 Access to nested classes Clang 2.7
46[temp.explicit] NAD Explicit instantiation of member templates Clang 2.7
47[temp.friend] NAD Template friend issues Superseded by 329
48[class.static.data] TC1 Definitions of unused static members Clang 2.7
49[temp.param] TC1 Restriction on non-type, non-value template arguments Clang 2.8
50[basic.def.odr] NAD Converting pointer to incomplete type to same type Clang 2.7
51[over.match.best] TC1 Overloading and user-defined conversions Clang 2.8
52[expr.ref] TC1 Non-static members, member selection and access checking Clang 2.8
53[expr.static.cast] TC1 Lvalue-to-rvalue conversion before certain static_casts Clang 2.7
54[expr.static.cast] CD1 Static_cast from private base to derived class Clang 2.8
55[expr.add] NAD Adding/subtracting pointer and enumeration value Clang 2.7
56[dcl.typedef] TC1 Redeclaring typedefs within classes Clang 2.7
57[class.union] open Empty unions Not resolved
58[class.bit] CD1 Signedness of bit fields of enum type Clang 3.1
59[over.match.copy] TC1 Clarification of overloading and UDC to reference type Clang 2.7
60[over.ics.ref] CD1 Reference binding and valid conversion sequences Clang 2.7
61[over.over] NAD Address of static member function "&p->f" Clang 3.4
62[temp.arg.type] CD1 Unnamed members of classes used as type parameters Clang 2.9
63[temp.inst] CD1 Class instantiation from pointer conversion to void*, null and self Clang 2.7
64[temp.expl.spec] TC1 Partial ordering to disambiguate explicit specialization Clang 2.7
65[dcl.fct.default] TC1 Typo in default argument example N/A
66[dcl.fct.default] NAD Visibility of default args vs overloads added after using-declaration No
67[class.static] TC1 Evaluation of left side of object-expression N/A
68[dcl.type.elab] TC1 Grammar does not allow "friend class A<int>;" Clang 2.8
69[dcl.stc] TC1 Storage class specifiers on template declarations Clang 9
70[temp.deduct.type] CD1 Is an array bound a nondeduced context? Clang 2.7
71[expr] NAD Incorrect cross reference N/A
72[temp] dup Linkage and storage class specifiers for templates Duplicate of 69
73[expr.eq] TC1 Pointer equality Superseded by 1652
74[expr.new] TC1 Enumeration value in direct-new-declarator Clang 2.7
75[class.mem] TC1 In-class initialized members must be const Clang 2.7
76[dcl.type.cv] TC1 Are const volatile variables considered "constant expressions"? Clang 2.7
77[class.friend] CD1 The definition of friend does not allow nested classes to be friends Clang 2.7
78[dcl.init] CD1 Section 8.5 paragraph 9 should state it only applies to non-static objects Superseded by ????
79[new.delete.placement] dup Alignment and placement new N/A
80[class.mem] TC1 Class members with same name as class Clang 2.9
81[diff] NAD Null pointers and C compatibility N/A
82[basic.def.odr] dup Definition of "using" a constant expression Duplicate of 48
83[over.ics.rank] TC1 Overloading and deprecated conversion of string literal Clang 2.7
84[over.best.ics] TC1 Overloading and conversion loophole used by auto_ptr Clang 2.7
85[basic.lookup.elab] TC1 Redeclaration of member class Clang 3.4
86[class.temporary] CD1 Lifetime of temporaries in query expressions Duplicate of 446
87[except.spec] CD1 Exception specifications on function parameters No
88[temp.expl.spec] NAD Specialization of member constant templates Clang 2.8
89[basic.life] TC1 Object lifetime does not account for reference rebinding N/A
90[basic.lookup.argdep] TC1 Should the enclosing class be an "associated class" too? Clang 2.7
91[basic.lookup.argdep] NAD A union's associated types should include the union itself Clang 2.7
92[except.spec] CD4 Should exception-specifications be part of the type system? Clang 4 (C++17 onwards)
93[basic.life] TC1 Missing word in 3.8 basic.life paragraph 2 N/A
94[expr.const] TC1 Inconsistencies in the descriptions of constant expressions Clang 2.7
95[namespace.memdef] NAD Elaborated type specifiers referencing names declared in friend decls Clang 3.3
96[temp.names] C++11 Syntactic disambiguation using the template keyword Superseded by P1787
97[expr.const] NAD Use of bool constants in integral constant expressions Clang 2.7
98[except] TC1 Branching into try block Clang 2.7
99[temp.deduct.call] NAD Partial ordering, references and cv-qualifiers Superseded by 214
100[temp.arg.nontype] TC1 Clarify why string literals are not allowed as template arguments Clang 2.7
101[namespace.udecl] TC1 Redeclaration of extern "C" names via using-declarations Clang 3.5
102[over.match.oper] NAD Operator lookup rules do not work well with parts of the library Clang 2.7
103[namespace.udir] TC1 Is it extended-namespace-definition or extension-namespace-definition ? N/A
104[except.throw] NAD Destroying the exception temp when no handler is found N/A (Library DR)
105[temp] TC1 Meaning of "template function" N/A
106[unknown] CD1 Creating references to references during template deduction/instantiation Superseded by 540
107[dcl.link] NAD Linkage of operator functions Clang 2.7
108[temp.dep.type] TC1 Are classes nested in templates dependent? Clang 2.9
109[namespace.udecl] NAD Allowing ::template in using-declarations Clang 2.8
110[temp] CD6 Can template functions and classes be declared in the same scope? Clang 2.8
111[class.copy.ctor] NAD Copy constructors and cv-qualifiers Duplicate of 535
112[dcl.array] CD1 Array types and cv-qualifiers Clang 3.1
113[expr.call] CD1 Visibility of called function Clang 2.7
114[temp.mem] NAD Virtual overriding by template member function specializations Clang 2.7
115[over.over] CD1 Address of template-id Clang 3.0
116[temp.over.link] TC1 Equivalent and functionally-equivalent function templates Clang 2.7
117[class.temporary] NAD Timing of destruction of temporaries N/A
118[expr.call] CD1 Calls via pointers to virtual member functions Yes
119[basic.life] CD1 Object lifetime and aggregate initialization N/A
120[temp.res] TC1 Nonexistent non-terminal qualified-name N/A
121[temp.res] TC1 Dependent type names with non-dependent nested-name-specifiers Clang 2.7
122[expr.prim.general] CD1 template-ids as unqualified-ids Clang 2.7
123[expr.prim.general] TC1 Bad cross-reference N/A
124[class.temporary] CD1 Lifetime of temporaries in default initialization of class arrays Clang 2.7
125[expr.prim.general] CD1 Ambiguity in friend declaration syntax Clang 2.7
126[except.spec] TC1 Exception specifications and const Partial
127[expr.new] TC1 Ambiguity in description of matching deallocation function Clang 2.9
128[expr.static.cast] TC1 Casting between enum types Clang 2.7
129[intro.execution] CD3 Stability of uninitialized auto variables Duplicate of 616
130[expr.new] NAD Sequence points and new-expressions N/A
131[extendid] TC1 Typo in Lao characters Superseded by P1949
132[basic.link] NAD Local types and linkage No
133[except.spec] dup Exception specifications and checking Duplicate of 87
134[temp] TC1 Template classes and declarator-ids N/A
135[dcl.fct] TC1 Class type in in-class member function definitions Clang 2.7
136[dcl.fct.default] CD1 Default arguments and friend declarations Clang 3.4
137[expr.static.cast] TC1 static_cast of cv void* Clang 2.7
138[namespace.memdef] CD6 Friend declaration name lookup Partial
139[basic.lookup.unqual] CD1 Error in friend lookup example Clang 2.7
140[dcl.fct] CD1 Agreement of parameter declarations Clang 2.7
141[basic.lookup.classref] CD1 Non-member function templates in member access expressions Clang 3.1
142[class.access.base] TC1 Injection-related errors in access example Clang 2.8
143[basic.lookup.argdep] CD1 Friends and Koenig lookup Clang 2.7
144[dcl.type.elab] open Position of friend specifier Not resolved
145[depr.impldec] TC1 Deprecation of prefix ++ Clang 2.7
146[basic.fundamental] open Floating-point zero Not resolved
147[expr.prim.general] TC1 Naming the constructor Clang 2.7
148[class] TC1 POD classes and pointers to members Clang 2.7
149[conv.ptr] TC1 Accessibility and ambiguity N/A
150[temp.arg.template] C++17 Template template parameters and default arguments Clang 19
151[dcl.init] TC1 Terminology of zero-initialization Clang 3.1
152[class.conv.ctor] TC1 explicit copy constructors Clang 2.7
153[over.ics.rank] TC1 Misleading wording (rank of conversion) N/A
154[dcl.stc] NAD Anonymous unions in unnamed namespaces Clang 2.7
155[dcl.init] dup Brace initializer for scalar Duplicate of 632
156[basic.lookup.classref] NAD Name lookup for conversion functions Superseded by 1111
157[dcl.pre] open Omitted typedef declarator Not resolved
158[basic.lval] CD1 Aliasing and qualification conversions Yes
159[dcl.meaning] TC1 Namespace qualification in declarators Clang 3.5
160[dcl.ambig.res] CD1 Missing std:: qualification N/A
161[class.protected] TC1 Access to protected nested type Clang 3.1
162[over.match.call] CD1 (&C::f)() with nonstatic members Clang 19
163[dcl.init.aggr] TC1 Description of subaggregate initializer N/A
164[basic.lookup.argdep] TC1 Overlap between Koenig and normal lookup Clang 2.7
165[namespace.memdef] NAD Definitions of friends and block-scope externs No
166[namespace.memdef] TC1 Friend declarations of template-ids Clang 2.9
167[depr.static] NAD Deprecating static functions Superseded by 1012
168[dcl.link] NAD C linkage for static member functions No
169[namespace.udecl] NAD template-ids in using-declarations Clang 3.4
170[conv.mem] CD7 Pointer-to-member conversions Clang 3.1
171[basic.namespace] TC1 Global namespace scope Clang 3.4
172[dcl.enum] CD1 Unsigned int as underlying type of enum Clang 2.7
173[lex.charset] TC1 Constraints on execution character set Clang 2.7
174[depr.static] NAD Undeprecating global static Superseded by 1012
175[class] CD1 Class name injection and base name access Clang 2.8
176[class] TC1 Name injection and templates Clang 3.1
177[dcl.init] CD1 Lvalues vs rvalues in copy-initialization Clang 2.7
178[dcl.init] TC1 More on value-initialization Clang 3.1
179[expr.add] TC1 Function pointers and subtraction Clang 2.7
180[temp.res] CD1 typename and elaborated types Clang 2.8
181[temp.deduct.type] TC1 Errors in template template-parameter example Clang 2.7
182[temp.expl.spec] NAD Access checking on explicit specializations Clang 14
183[temp.res] TC1 typename in explicit specializations Superseded by 382
184[temp.param] CD1 Default arguments in template template-parameters Clang 2.7
185[class.copy.ctor] TC1 "Named" temporaries and copy elision Clang 2.7
186[temp.local] open Name hiding and template template-parameters Not resolved
187[temp.param] TC1 Scope of template parameter names Superseded by 481
188[expr.comma] TC1 Comma operator and rvalue conversion Clang 2.7
189[lex.operators] open Definition of operator and punctuator Not resolved
190[class.mem] TC1 Layout-compatible POD-struct types Clang 19
191[basic.lookup.unqual] CD6 Name lookup does not handle complex nesting Clang 2.7
192[basic.lookup.unqual] NAD Name lookup in parameters Clang 2.7
193[class.dtor] TC1 Order of destruction of local automatics of destructor Clang 2.7
194[class.ctor] TC1 Identifying constructors Clang 2.7
195[expr.reinterpret.cast] CD1 Converting between function and object pointers Clang 2.7
196[expr.delete] open Arguments to deallocation functions Not resolved
197[temp.dep.candidate] CD1 Issues with two-stage lookup of dependent names Clang 2.7
198[class.local] CD1 Definition of "use" in local and nested classes Clang 2.9
199[class.temporary] CD1 Order of destruction of temporaries Clang 2.8
200[temp.func.order] dup Partial ordering and explicit arguments Duplicate of 214
201[class.temporary] CD1 Order of destruction of temporaries in initializers Clang 2.8
202[over.over] TC1 Use of overloaded function name Clang 3.1
203[expr.unary.op] NAD Type of address-of-member expression Clang 3.0
204[temp] CD1 Exported class templates Superseded by 820
205[temp] drafting Templates and static data members Not resolved
206[temp.nondep] TC1 Semantic constraints on non-dependent names Clang 2.7
207[class.access.base] CD1 using-declarations and protected access Clang 2.7
208[except.throw] CD1 Rethrowing exceptions in nested handlers Unknown
209[class.friend] NAD Must friend declaration names be accessible? Clang 3.2
210[except.handle] TC1 What is the type matched by an exception handler? Clang 2.7
211[except] NAD Constructors should not be allowed to return normally after an exception Clang 2.7
212[temp.inst] CD4 Implicit instantiation is not described clearly enough Yes
213[temp.dep] TC1 Lookup in dependent base classes Clang 2.7
214[temp.func.order] CD1 Partial ordering of function templates is underspecified Clang 2.7
215[temp.param] CD1 Template parameters are not allowed in nested-name-specifiers Clang 2.9
216[basic.link] CD1 Linkage of nameless class-scope enumeration types No
217[dcl.fct.default] TC1 Default arguments for non-template member functions of class templates Clang 2.7
218[basic.lookup.argdep] CD1 Specification of Koenig lookup Clang 2.7
219[except.terminate] NAD Cannot defend against destructors that throw exceptions N/A
220[basic.stc.dynamic.deallocation] CD1 All deallocation functions should be required not to throw N/A
221[over.assign] CD1 Must compound assignment operators be member functions? Clang 3.6
222[expr] CD1 Sequence points and lvalue-returning operators Duplicate of 637
223[depr] CD3 The meaning of deprecation N/A
224[temp.dep.type] CD1 Definition of dependent names Clang 16
225[basic.lookup.argdep] NAD Koenig lookup and fundamental types Yes
226[temp.param] CD1 Default template arguments for function templates No
227[stmt.select] TC1 How many scopes in an if statement? Clang 2.7
228[temp.names] CD1 Use of template keyword with non-member templates Clang 2.7
229[temp.spec.partial] NAD Partial specialization of function templates Clang 2.9
230[class.abstract] NAD Calls to pure virtual functions Clang 3.0
231[basic.lookup.unqual] NAD Visibility of names after using-directives Clang 2.7
232[expr.unary.op] NAD Is indirection through a null pointer undefined behavior? Duplicate of 2823
233[dcl.init.ref] CD7 References vs pointers in UDC overload resolution Unknown
234[basic.life] NAD Reuse of base class subobjects N/A
235[class.base.init] TC1 Assignment vs initialization N/A
236[expr.const] NAD Explicit temporaries and integral constant expressions Clang 3.2
237[temp.explicit] CD1 Explicit instantiation and base class members Duplicate of 470
238[expr] CD4 Precision and accuracy constraints on floating point Unknown
239[over.call.func] CD1 Footnote 116 and Koenig lookup Clang 2.7
240[conv.lval] CD3 Uninitialized values and undefined behavior Duplicate of 616
241[temp.arg.explicit] TC1 Error in example in 14.8.1 Clang 9
242[expr.cast] CD4 Interpretation of old-style casts Unknown
243[over.ics.user] NAD Weighting of conversion functions in direct-initialization Clang 2.8
244[class.dtor] CD1 Destructor lookup Clang 11
245[basic.lookup.elab] CD1 Name lookup in elaborated-type-specifiers Clang 2.8
246[temp.arg] CD1 Jumps in function-try-block handlers Clang 3.2
247[over.over] NAD Pointer-to-member casts and function overload resolution Clang 2.7
248[extendid] C++11 Identifier characters Superseded by P1949
249[temp.mem.func] TC1 What is a member function template? Clang 2.7
250[over.over] TC1 Address of function template specialization with non-deduced template arguments Clang 2.7
251[basic.fundamental] open How many signed integer types are there? Not resolved
252[class.dtor] CD1 Looking up deallocation functions in virtual destructors Clang 3.1
253[dcl.init] C++17 Why must empty or fully-initialized const objects be initialized? Unknown
254[basic.lookup.elab] CD1 Definitional problems with elaborated-type-specifiers Clang 2.9
255[class.free] CD6 Placement deallocation functions and lookup ambiguity Clang 2.7
256[expr.new] CD1 Overflow in size calculations Duplicate of 624
257[class.base.init] CD2 Abstract base constructors and virtual base initialization Clang 3.4
258[namespace.udecl] CD1 using-declarations and cv-qualifiers Clang 2.8
259[temp.spec] CD1 Restrictions on explicit specialization and instantiation Clang 4
260[over.built] open User-defined conversions and built-in operator= Not resolved
261[basic.def.odr] CD1 When is a deallocation function "used?" No
262[dcl.fct] CD1 Default arguments and ellipsis Clang 2.7
263[class.ctor] CD1 Can a constructor be declared a friend? Clang 3.3
264[temp.arg.explicit] open Unusable template constructors and conversion functions Not resolved
265[expr.delete] dup Destructors, exceptions, and deallocation Duplicate of 353
266[gram] NAD No grammar sentence symbol N/A
267[expr.new] open Alignment requirement for new-expressions Not resolved
268[cpp.rescan] open Macro name suppression in rescanned replacement text Not resolved
269[basic.start.static] NAD Order of initialization of multiply-defined static data members of class templates N/A
270[basic.start.static] CD1 Order of initialization of static data members of class templates N/A
271[temp.deduct] CD6 Explicit instantiation and template argument deduction Unknown
272[class.dtor] CD1 Explicit destructor invocation and qualified-ids Clang 2.7
273[class] CD1 POD classes and operator&() Clang 2.7
274[basic.life] CD1 Cv-qualification and char-alias access to out-of-lifetime objects N/A
275[temp.expl.spec] CD1 Explicit instantiation/specialization and using-directives No
276[stmt.jump] CD1 Order of destruction of parameters and temporaries N/A
277[dcl.init] CD1 Zero-initialization of pointers Clang 3.1
278[basic.link] NAD External linkage and nameless entities Unknown
279[basic.link] CD6 Correspondence of "names for linkage purposes" No
280[over.call.object] CD1 Access and surrogate call functions Clang 2.9
281[dcl.fct.spec] CD1 inline specifier in friend declarations No
282[expr.typeid] open Namespace for extended_type_info Not resolved
283[dcl.type.simple] CD1 Template type-parameters are not syntactically type-names Clang 2.7
284[class] CD1 qualified-ids in class declarations No
285[temp.expl.spec] NAD Identifying a function template being specialized Clang 2.7
286[temp.spec.partial] CD1 Incorrect example in partial specialization Clang 2.8
287[temp.point] drafting Order dependencies in template instantiation Not resolved
288[expr.delete] CD1 Misuse of "static type" in describing pointers N/A
289[basic.def.odr] CD1 Incomplete list of contexts requiring a complete type Clang 2.7
290[basic.types] NAD Should memcpy be allowed into a POD with a const member? N/A
291[dcl.init.ref] CD1 Overload resolution needed when binding reference to class rvalue Duplicate of 391
292[expr.new] CD3 Deallocation on exception in new before arguments evaluated Clang 2.9
293[temp.explicit] open Syntax of explicit instantiation/specialization too permissive Not resolved
294[expr.static.cast] NAD Can static_cast drop exception specifications? No
295[dcl.fct] CD1 cv-qualifiers on function types Clang 3.7
296[class.conv.fct] CD1 Can conversion functions be static? Clang 2.7
297[temp.deduct] NAD Which template does an explicit specialization specialize? Unknown
298[class.qual] CD1 T::x when T is cv-qualified Clang 3.1
299[expr.new] CD1 Conversion on array bound expression in new Clang 2.8 (C++11 onwards)
300[temp.deduct.type] CD1 References to functions in template argument deduction Clang 2.7
301[temp.names] CD1 Syntax for template-name Clang 3.5
302[dcl.init] CD1 Value-initialization and generation of default constructor Clang 3.0
303[conv.prom] NAD Integral promotions on bit-fields N/A
304[dcl.init] TC1 Value-initialization of a reference Clang 2.9
305[basic.lookup.classref] CD1 Name lookup in destructor call No
306[class.member.lookup] CD1 Ambiguity by class name injection Duplicate of 39
307[class.cdtor] NAD Initialization of a virtual base class subobject N/A
308[except.handle] NAD Catching exceptions with ambiguous base classes Clang 3.7
309[basic.pre] CD1 Linkage of entities whose names are not simply identifiers, in introduction Duplicate of 485
310[temp.over.link] open Can function templates differing only in parameter cv-qualifiers be overloaded? Not resolved
311[namespace.def] NAD Using qualified name to reopen nested namespace Clang 3.0
312[basic.stc.dynamic.deallocation] CD3 “use” of invalid pointer value not defined Duplicate of 616
313[expr.new] dup Class with single conversion function to integral as array size in new Duplicate of 299 (C++11 onwards)
314[temp.names] C++17 template in base class specifier No
315[class.static.mfct] NAD Is call of static member function through null pointer undefined? N/A
316[temp.local] NAD Injected-class-name of template used as template template parameter Superseded by 1004
317[dcl.fct.spec] CD1 Can a function be declared inline after it has been called? Clang 3.5
318[class.qual] CD1 struct A::A should not name the constructor of A Superseded by 1310
319[basic.link] CD1 Use of names without linkage in declaring entities with linkage No
320[class.temporary] CD1 Question on copy constructor elision example Clang 3.1
321[basic.lookup.argdep] dup Associated classes and namespaces for argument-dependent lookup Duplicate of 557
322[temp.deduct.conv] CD1 Deduction of reference conversions Clang 2.8
323[temp] CD1 Where must export appear? Superseded by 820
324[expr.unary.op] CD1 Can "&" be applied to assignment to bit-field? Clang 3.6
325[dcl.fct.default] open When are default arguments parsed? Not resolved
326[class.ctor] CD1 Wording for definition of trivial constructor Clang 3.1
327[class] CD1 Use of "structure" without definition Duplicate of 538
328[class.mem] CD1 Missing requirement that class member types be complete Clang 2.7
329[temp.friend] CD1 Evaluation of friends of templates Clang 3.5
330[conv.qual] CD4 Qualification conversions and pointers to arrays of pointers Clang 7
331[class.ctor] CD1 Allowed copy constructor signatures Clang 11
332[dcl.fct] CD3 cv-qualified void parameter types Duplicate of 577
333[dcl.ambig.res] NAD Ambiguous use of "declaration" in disambiguation section Clang 2.7
334[temp.dep.expr] NAD Is a comma-expression dependent if its first operand is? Clang 2.7
335[temp] CD1 Allowing export on template members of nontemplate classes Superseded by 820
336[temp.expl.spec] CD1 Explicit specialization examples are still incorrect Clang 2.7
337[temp.deduct] CD1 Attempt to create array of abstract type should cause deduction to fail Clang 2.7
338[basic.link] CD6 Enumerator name with linkage used as class name in other translation unit Duplicate of 1884
339[expr.const] CD1 Overload resolution in operand of sizeof in constant expression Clang 2.8
340[dcl.ambig.res] NAD Unclear wording in disambiguation section Clang 2.7
341[dcl.link] C++11 extern "C" namespace member function versus global variable Superseded by 1708
342[expr.unary] CD3 Terminology: "indirection" versus "dereference" N/A
343[temp.names] C++17 Make template optional in contexts that require a type No
344[class.dtor] CD3 Naming destructors Duplicate of 1435
345[temp.res] CD1 Misleading comment on example in templates chapter Clang 2.7
346[except.spec] NAD Typo in 15.4 N/A
347[class.nest] NAD Use of derived class name in defining base class nested class Clang 2.7
348[basic.stc.dynamic.deallocation] CD1 delete and user-written deallocation functions N/A
349[temp.deduct.conv] CD1 Template argument deduction for conversion functions and qualification conversions No
350[basic.types] open signed char underlying representation for objects Not resolved
351[expr] CD1 Sequence point error: unspecified or undefined? N/A
352[temp.deduct.call] CD1 Nondeduced contexts Clang 2.8
353[expr.delete] CD1 Is deallocation routine called if destructor throws exception in delete? Unknown
354[temp.arg.nontype] CD1 Null as nontype template argument Clang 3.1 (C++11 onwards)
355[class] C++11 Global-scope :: in nested-name-specifier Clang 2.7
356[class.copy.ctor] NAD Wording of behavior of generated copy constructor for scalar members N/A
357[intro.defs] CD1 Definition of signature should include name Clang 2.7
358[dcl.link] NAD Namespaces and extern "C" Clang 2.7
359[class.union] NAD Type definition in anonymous union Clang 3.3
360[class.access.base] CD6 Using-declaration that reduces access Clang 2.8
361[dcl.fct.default] open Forward reference to default argument Not resolved
362[lex.phases] CD1 Order of initialization in instantiation units N/A
363[class.expl.init] NAD Initialization of class from self N/A
364[over.call.func] CD1 Calling overloaded function with static in set, with no object Clang 2.7
365[basic.stc] open Storage duration and temporaries Not resolved
366[expr.const] CD1 String literal allowed in integral constant expression? Clang 2.7
367[expr.const] CD1 throw operator allowed in constant expression? Clang 2.7
368[temp.deduct] CD1 Uses of non-type parameters that should cause deduction to fail Clang 3.6
369[lex.pptoken] open Are new/delete identifiers or preprocessing-op-or-punc? Not resolved
370[cpp.include] CD1 Can #include <...> form be used other than for standard C++ headers? N/A
371[basic.start.static] open Interleaving of constructor calls Not resolved
372[temp.arg] CD1 Is access granted by base class specifiers available in following base class specifiers? No
373[basic.lookup.udir] C++11 Lookup on namespace qualified name in using-directive Clang 5
374[dcl.meaning] CD2 Can explicit specialization outside namespace use qualified name? Clang 7
375[temp.res] dup Confusing example on lookup with typename Duplicate of 345
376[dcl.fct.spec] NAD Class "definition" versus class "declaration" N/A
377[dcl.enum] CD1 Enum whose enumerators will not fit in any integral type Clang 2.7
378[stmt.jump] CD1 Wording that says temporaries are declared Duplicate of 276
379[class] CD1 Change "class declaration" to "class definition" N/A
380[class.member.lookup] open Definition of "ambiguous base class" missing Not resolved
381[basic.lookup.classref] CD1 Incorrect example of base class member lookup Clang 2.7
382[temp.res] CD1 Allow typename outside of templates Clang 2.7 (C++11 onwards)
383[class] CD1 Is a class with a declared but not defined destructor a POD? Clang 2.7
384[basic.lookup.argdep] NAD Argument-dependent lookup and operator functions Clang 2.7
385[class.protected] CD1 How does protected member check of 11.5 interact with using-declarations? Clang 2.8
386[namespace.udecl] CD6 Friend declaration of name brought in by using-declaration No
387[temp.inject] CD1 Errors in example in 14.6.5 Clang 2.8
388[except.handle] CD3 Catching base*& from a throw of derived* Unknown
389[basic.link] CD1 Unnamed types in entities with linkage No
390[class.abstract] CD1 Pure virtual must be defined when implicitly called Clang 3.3
391[dcl.init.ref] CD1 Require direct binding of short-lived references to rvalues Clang 2.8 (C++11 onwards)
392[class.temporary] CD1 Use of full expression lvalue before temporary destruction Clang 2.8
393[dcl.fct] CD4 Pointer to array of unknown bound in template argument list in parameter Clang 2.7
394[cpp.pre] CD1 identifier-list is never defined N/A
395[class.conv.fct] NAD Conversion operator template syntax Clang 3.0
396[dcl.fct.spec] CD1 Misleading note regarding use of auto for disambiguation Clang 3.0
397[dcl.fct.spec] CD1 Same address for string literals from default arguments in inline functions? Superseded by 1823
398[temp.deduct] CD1 Ambiguous wording on naming a type in deduction Clang 2.7
399[class.dtor] CD6 Destructor lookup redux Clang 11
400[namespace.qual] CD1 Using-declarations and the "struct hack" Clang 2.7
401[temp.param] CD1 When is access for template parameter default arguments checked? Clang 2.8
402[temp.func.order] open More on partial ordering of function templates Not resolved
403[basic.lookup.argdep] CD1 Reference to a type as a template-id Clang 2.7
404[basic.life] CD1 Unclear reference to construction with non-trivial constructor N/A
405[basic.lookup.unqual] CD6 Unqualified function name lookup Clang 2.7
406[class.static.data] CD1 Static data member in class with name for linkage purposes Clang 2.9
407[dcl.typedef] C++11 Named class with associated typedef: two names or one? Clang 3.8
408[temp.static] CD2 sizeof applied to unknown-bound array static data member of template Clang 3.4
409[temp.res] CD1 Obsolete paragraph missed by changes for issue 224 Clang 2.7
410[temp.friend] CD1 Paragraph missed in changes for issue 166 No
411[lex.string] CD6 Use of universal-character-name in character versus string literals Unknown
412[dcl.fct.spec] NAD Can a replacement allocation function be inline? Clang 3.4
413[class] CD1 Definition of "empty class" Clang 2.7
414[basic.lookup.classref] CD1 Multiple types found on destructor lookup Duplicate of 305
415[temp.over] CD1 Template deduction does not cause instantiation Clang 2.7
416[over.match.oper] CD1 Class must be complete to allow operator lookup? Clang 2.7
417[class.name] CD1 Using derived-class qualified name in out-of-class nested class definition No
418[over.match.best] CD6 Imperfect wording on error on multiple default arguments on a called function No
419[basic.life] open Can cast to virtual base class be done on partially-constructed object? Not resolved
420[over.ref] CD1 postfixexpression->scalar_type_dtor() inconsistent Clang 9
421[expr.ref] CD1 Is rvalue.field an rvalue? Clang 2.7
422[dcl.typedef] NAD Is a typedef redeclaration allowed with a template type that might be the same? Clang 2.7
423[over.match.oper] NAD Can a conversion be done on the left operand of a compound assignment? Clang 2.7
424[dcl.typedef] CD1 Wording problem with issue 56 resolution on redeclaring typedefs in class scope Clang 2.7
425[over.built] CD1 Set of candidates for overloaded built-in operator with float operand Clang 2.7
426[basic.link] C++17 Identically-named variables, one internally and one externally linked, allowed? Unknown
427[expr.static.cast] CD1 static_cast ambiguity: conversion versus cast to derived Clang 2.7
428[except.throw] CD1 Mention of expression with reference type Clang 2.7
429[expr.new] CD1 Matching deallocation function chosen based on syntax or signature? Clang 2.8 (C++11 onwards)
430[dcl.init.aggr] CD1 Ordering of expression evaluation in initializer list Clang 2.7 (C++11 onwards)
431[temp.names] C++11 Defect in wording in 14.2 Clang 2.8
432[basic.scope.class] CD1 Is injected class name visible in base class specifier list? Clang 3.0
433[basic.scope.pdecl] CD1 Do elaborated type specifiers in templates inject into enclosing namespace scope? Clang 2.7
434[dcl.init.ref] NAD Unclear suppression of standard conversions while binding reference to lvalue Superseded by 2352
435[dcl.pre] NAD Change "declararation or definition" to "declaration" N/A
436[class.bit] CD1 Problem in example in 9.6 paragraph 4 Clang 2.7
437[class.mem] CD1 Is type of class allowed in member function exception specification? Superseded by 1308
438[expr] CD2 Possible flaw in wording for multiple accesses to object between sequence points Clang 2.7
439[expr.static.cast] CD1 Guarantees on casting pointer back to cv-qualified version of original type Clang 2.7
440[temp.arg] NAD Allow implicit pointer-to-member conversion on nontype template argument Unknown
441[basic.start.static] CD1 Ordering of static reference initialization Clang 2.7
442[expr.delete] CD1 Incorrect use of null pointer constant in description of delete operator Superseded by 348
443[class.temporary] CD1 Wording nit in description of lifetime of temporaries N/A
444[class.copy.assign] NAD Overriding and the generated copy assignment operator Clang 2.7
445[class.friend] NAD Wording issue on friend declarations Clang 3.2
446[expr.cond] CD1 Does an lvalue-to-rvalue conversion on the "?" operator produce a temporary? Clang 2.8
447[temp.dep.constexpr] CD1 Is offsetof type-dependent? Clang 2.8
448[temp.local] C++11 Set of template functions in call with dependent explicit argument Clang 2.8
449[intro.defs] NAD Consistency in use of hyphen with names of "non" entities N/A
450[dcl.init.ref] CD1 Binding a reference to const to a cv-qualified array rvalue Clang 3.2
451[expr] CD1 Expressions with invalid results and ill-formedness Clang 2.7
452[class.this] CD1 Wording nit on description of this Clang 2.7
453[dcl.ref] CD7 References may only bind to “valid” objects Unknown
454[class.static.data] CD1 When is a definition of a static data member required? Unknown
455[over.match.best] NAD Partial ordering and non-deduced arguments Unknown
456[conv.ptr] NAD Is initialized const int or const bool variable a null pointer constant? Clang 3.4
457[expr.const] CD1 Wording nit on use of const variables in constant expressions Clang 2.7
458[temp.local] C++11 Hiding of member template parameters by other members Clang 11
459[temp.local] NAD Hiding of template parameters by base class members Unknown
460[namespace.udecl] CD1 Can a using-declaration name a namespace? Clang 2.7
461[dcl.asm] NAD Make asm conditionally-supported N/A
462[class.temporary] CD3 Lifetime of temporaries bound to comma expressions Clang 2.7
463[expr.reinterpret.cast] CD1 reinterpret_cast<T*>(0) N/A
464[class.temporary] CD1 Wording nit on lifetime of temporaries to which references are bound N/A
465[basic.start.static] NAD May constructors of global objects call exit()? N/A
466[expr.pseudo] CD1 cv-qualifiers on pseudo-destructor type Clang 2.8
467[stmt.dcl] NAD Jump past initialization of local static variable Clang 2.7
468[temp.names] CD1 Allow ::template outside of templates Clang 2.7 (C++11 onwards)
469[temp.deduct.type] NAD Const template specializations and reference arguments No
470[temp.explicit] CD1 Instantiation of members of an explicitly-instantiated class template Clang 2.7
471[class.access.base] NAD Conflicting inherited access specifications Clang 2.8
472[class.protected] open Casting across protected inheritance
@@ -2887,5528 +3358,6447 @@

C++ defect report implementation status

473[expr.new] NAD Block-scope declarations of allocator functions Unknown
474[basic.link] CD1 Block-scope extern declarations in namespace members Clang 3.4
475[except.uncaught] C++11 When is std::uncaught_exception() true? (take 2) Unknown
476[expr.new] CD5 Determining the buffer size for placement new Unknown
477[dcl.fct.spec] CD1 Can virtual appear in a friend declaration? Clang 3.5
478[dcl.array] NAD May a function parameter be an array of an abstract class type? Clang 2.7
479[except.throw] CD1 Copy elision in exception handling Clang 2.8
480[conv.mem] CD1 Is a base of a virtual base also virtual? Clang 2.7
481[basic.scope] CD2 Scope of template parameters Clang 2.8
482[dcl.meaning] CD3 Qualified declarators in redeclarations Clang 3.5
483[basic.fundamental] CD3 Normative requirements on integral ranges Clang 2.7
484[class.derived] CD1 Can a base-specifier name a cv-qualified class type? Clang 2.8
485[basic.pre] CD1 What is a “name”? Clang 2.7
486[temp.deduct] CD1 Invalid return types and template argument deduction Clang 2.7
487[expr.const] NAD Operator overloading in constant expressions Clang 2.7
488[temp.deduct] CD1 Local types, overload resolution, and template argument deduction Clang 2.9 (C++11 onwards)
489[temp.inst] NAD Must member function templates be instantiated during overload resolution? N/A
490[basic.lookup.unqual] CD2 Name lookup in friend declarations Clang 2.8
491[dcl.init.aggr] CD1 Initializers for empty-class aggregate members Duplicate of 413
492[expr.typeid] CD1 typeid constness inconsistent with example Clang 2.7
493[temp.deduct.conv] CD2 Type deduction from a bool context Duplicate of 976
494[class.access] CD1 Problems with the resolution of issue 45 Duplicate of 372
495[over.match.best] CD2 Overload resolution with template and non-template conversion functions Clang 3.5
496[basic.types] CD3 Is a volatile-qualified type really a POD? Superseded by 2094
497[expr.mptr.oper] CD1 Missing required initialization in example Superseded by 253
498[dcl.stc] open Storage class specifiers in definitions of class members Not resolved
499[except.throw] CD2 Throwing an array of unknown size Clang 2.7
500[class.friend] CD1 Access in base-specifiers of friend and nested classes Duplicate of 372
501[class.friend] NAD Visibility of friend declarations within the befriending class Clang 2.7
502[temp.dep.type] C++11 Dependency of nested enumerations and enumerators Clang 2.7
503[temp.deduct.call] open Cv-qualified function types in template argument deduction Not resolved
504[dcl.ref] NAD Should use of a variable in its own initializer require a diagnostic? Unknown
505[lex.ccon] CD1 Conditionally-supported behavior for unknown character escapes Clang 2.7
506[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis Clang 2.7
507[over.built] dup Ambiguity assigning class object to built-in type Duplicate of 260
508[dcl.init] C++11 Non-constructed value-initialized objects N/A
509[dcl.init] CD1 Dead code in the specification of default initialization N/A
510[class.init] CD1 Default initialization of POD classes? N/A
511[class.prop] NAD POD-structs with template assignment operators Unknown
512[class.union] NAD Union members with user-declared non-default constructors Clang 3.0
513[intro.object] CD1 Non-class “most-derived” objects N/A
514[basic.lookup.unqual] CD1 Is the initializer for a namespace member in the scope of the namespace? Clang 2.7
515[temp.dep] CD1 Non-dependent references to base class members Superseded by 1017
516[dcl.type.simple] CD1 Use of signed in bit-field declarations N/A
517[temp.spec.partial.general] CD1 Partial specialization following explicit instantiation No
518[dcl.enum] CD1 Trailing comma following enumerator-list Clang 2.7 (C++11 onwards)
519[conv.ptr] CD1 Null pointer preservation in void* conversions Clang 2.7
520[expr.cast] CD1 Old-style casts between incomplete class types N/A
521[basic.stc.dynamic.allocation] CD1 Requirements for exceptions thrown by allocation functions No
522[temp.deduct.call] CD1 Array-to-pointer decay in template argument deduction Clang 2.7
523[basic.stc.dynamic.deallocation] open Can a one-past-the-end pointer be invalidated by deleting an adjacent object? Not resolved
524[temp.dep] CD1 Can function-notation calls to operator functions be dependent? Clang 2.7
525[temp.inst] CD1 Missing * in example Clang 2.7
526[temp.deduct.type] CD1 Confusing aspects in the specification of non-deduced contexts Clang 2.7
527[basic.link] CD2 Problems with linkage of types N/A
528[expr.typeid] NAD Why are incomplete class types not allowed with typeid? Clang 2.7
529[temp.expl.spec] open Use of template<> with “explicitly-specialized” class templates Not resolved
530[expr.const] CD1 Nontype template arguments in constant expressions Clang 2.7
531[temp.expl.spec] C++11 Defining members of explicit specializations Partial
532[temp.func.order] C++11 Member/nonmember operator template partial ordering Clang 3.5
533[cpp.include] NAD Special treatment for C-style header names N/A
534[temp] CD1 template-names and operator-function-ids Clang 2.9
535[class.copy.ctor] CD3 Copy construction without a copy constructor Clang 3.1
536[expr.prim.general] CD6 Problems in the description of id-expressions N/A
537[intro.defs] CD1 Definition of “signature” N/A
538[class] CD1 Definition and usage of structure, POD-struct, POD-union, and POD class N/A
539[dcl.type] CD3 Constraints on type-specifier-seq Clang 3.4
540[namespace.def] CD1 Propagation of cv-qualifiers in reference-to-reference collapse Clang 2.7
541[temp.dep.expr] CD2 Dependent function types Clang 2.7
542[class.init] CD2 Value initialization of arrays of POD-structs Clang 3.5
543[dcl.init] CD1 Value initialization and default constructors Clang 3.0
544[temp.dep] NAD Base class lookup in explicit specialization Clang 2.7
545[over.match.oper] open User-defined conversions and built-in operator overload resolution Not resolved
546[temp.explicit] C++11 Explicit instantiation of class template members Clang 2.7
547[dcl.fct] C++11 Partial specialization on member function types Clang 3.2
548[dcl.meaning] dup qualified-ids in declarations Duplicate of 482
549[temp.spec.partial.match] drafting Non-deducible parameters in partial specializations Not resolved
550[dcl.fct] dup Pointer to array of unknown bound in parameter declarations Duplicate of 393
551[temp.explicit] CD1 When is inline permitted in an explicit instantiation? Clang 2.7 (C++11 onwards)
552[temp.names] NAD Use of typename in the type in a non-type parameter-declaration Clang 2.7
553[namespace.memdef] NAD Problems with friend allocation and deallocation functions Clang 2.7
554[basic.scope] CD6 Definition of “declarative region” and “scope” N/A
555[basic.lookup] CD5 Pseudo-destructor name lookup Clang 2.8
556[expr.assign] CD2 Conflicting requirements for acceptable aliasing N/A
557[basic.lookup.argdep] CD1 Does argument-dependent lookup cause template instantiation? Clang 3.1
558[lex.charset] CD1 Excluded characters in universal character names Clang 2.9
559[temp.res] CD1 Editing error in issue 382 resolution Clang 2.7
560[temp.res] NAD Use of the typename keyword in return types Clang 16
561[temp.dep.candidate] CD2 Internal linkage functions in dependent name lookup Clang 2.7
562[class.qual] CD6 qualified-ids in non-expression contexts N/A
563[dcl.link] CD6 Linkage specification for objects Clang 3.3
564[dcl.link] CD2 Agreement of language linkage or linkage-specifications? Clang 2.7
565[namespace.udecl] CD3 Conflict rules for using-declarations naming function templates Clang 2.7
566[conv.fpint] NAD Conversion of negative floating point values to integer type Clang 3.1
567[expr.add] NAD Can size_t and ptrdiff_t be larger than long? N/A
568[class] CD1 Definition of POD is too strict Clang 3.0 (C++11 onwards)
569[dcl.pre] CD2 Spurious semicolons at namespace scope should be allowed Clang 2.7 (C++11 onwards)
570[basic.def.odr] CD2 Are references subject to the ODR? Duplicate of 633
571[basic.link] CD2 References declared const Clang 2.7
572[conv] C++11 Standard conversions for non-built-in types Clang 2.7
573[expr.reinterpret.cast] C++11 Conversions between function pointers and void* No
574[class.copy.assign] NAD Definition of “copy assignment operator” Clang 3.0
575[temp.deduct] C++11 Criteria for deduction failure Clang 2.7
576[dcl.typedef] CD2 Typedefs in function definitions Clang 3.5
577[dcl.fct] CD3 void in an empty parameter list Clang 3.5
578[lex.phases] CD6 Phase 1 replacement of characters with universal-character-names Unknown
579[temp.names] open What is a “nested” > or >>? Not resolved
580[class.access] C++11 Access in template-parameters of member and friend definitions Partial
581[temp.arg.explicit] CD5 Can a templated constructor be explicitly instantiated or specialized? Unknown
582[temp.mem] CD1 Template conversion functions N/A
583[expr.rel] CD3 Relational pointer comparisons against the null pointer constant Clang 4
584[basic.lval] NAD Unions and aliasing N/A
585[class.friend] NAD Friend template template parameters Clang 3.0
586[temp.deduct.type] NAD Default template-arguments and template argument deduction N/A
587[expr.cond] CD2 Lvalue operands of a conditional expression differing only in cv-qualification Clang 3.2
588[temp.dep] CD2 Searching dependent bases of classes local to function templates Clang 2.7
589[dcl.init.ref] CD2 Direct binding of class and array rvalues in reference initialization Clang 2.7
590[temp.dep.type] C++11 Nested classes and the “current instantiation” Clang 2.7
591[temp.dep] CD4 When a dependent base class is the current instantiation Clang 20
592[except.ctor] CD1 Exceptions during construction of local static objects N/A
593[except.handle] NAD Falling off the end of a destructor's function-try-block handler Clang 2.8
594[basic.life] CD1 Coordinating issues 119 and 404 with delegating constructors N/A
595[except.spec] dup Exception specifications in templates instantiated from class bodies Duplicate of 1330
596[except.unexpected] NAD Replacing an exception object Unknown
597[basic.life] CD3 Conversions applied to out-of-lifetime non-POD lvalues N/A
598[basic.lookup.argdep] CD2 Associated namespaces of overloaded functions and function templates Clang 2.7
599[expr.delete] CD2 Deleting a null function pointer Partial
600[class.access] CD6 Does access control apply to members or to names? Clang 2.8
601[cpp.cond] CD2 Type of literals in preprocessing expressions Clang 2.7
602[temp.local] C++11 When is the injected-class-name of a class template a template? Clang 2.7
603[temp.type] CD1 Type equivalence and unsigned overflow Clang 3.1
604[over.match.ctor] CD2 Argument list for overload resolution in copy-initialization N/A
605[temp.expl.spec] C++11 Linkage of explicit specializations Clang 2.7
606[temp.deduct.call] CD1 Template argument deduction for rvalue references Clang 3.0
607[class.base.init] CD6 Lookup of mem-initializer-ids Clang 2.7
608[class.virtual] CD2 Determining the final overrider of a virtual function Clang 2.7
609[dcl.type.cv] CD4 What is a “top-level” cv-qualifier? Unknown
610[expr.unary.op] NAD Computing the negative of 0U Clang 2.7
611[dcl.init] CD2 Zero-initializing references Clang 2.7
612[intro.execution] CD2 Requirements on a conforming implementation N/A
613[class.mem] CD1 Unevaluated uses of non-static class members Clang 3.1 (C++11 onwards)
614[expr.mul] CD1 Results of integer / and % Clang 2.7
615[dcl.init] C++11 Incorrect description of variables that can be initialized Clang 2.7
616[intro.defs] CD3 Definition of “indeterminate value” Clang 4
617[conv.lval] NAD Lvalue-to-rvalue conversions of uninitialized char objects Unknown
618[cpp.cond] CD2 Casts in preprocessor conditional expressions Clang 2.7
619[basic.types] C++11 Completeness of array types Clang 3.4
620[class.mem] CD1 Declaration order in layout-compatible POD structs Duplicate of 568
621[temp.expl.spec] C++11 Template argument deduction from function return types Clang 2.7
622[expr.rel] NAD Relational comparisons of arbitrary pointers Unknown
623[basic.stc.dynamic.deallocation] CD3 Use of pointers to deallocated storage N/A
624[expr.new] CD1 Overflow in calculating size of allocation Unknown
625[dcl.spec.auto] CD2 Use of auto as a template-argument Clang 2.9
626[cpp.stringize] CD2 Preprocessor string literals Clang 2.7
627[basic.fundamental] NAD Values behaving as types Clang 2.7
628[dcl.enum] CD2 The values of an enumeration with no enumerator N/A
629[dcl.spec.auto] CD1 auto parsing ambiguity Clang 2.9
630[lex.charset] CD2 Equality of narrow and wide character values in the basic character set Clang 2.7
631[stmt.if] CD3 Jumping into a “then” clause N/A
632[dcl.init.aggr] CD1 Brace-enclosed initializer for scalar member of aggregate Clang 2.7
633[basic.pre] CD2 Specifications for variables that should also apply to references N/A
634[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis redux Clang 2.7
635[class.qual] NAD Names of constructors and destructors of templates Clang 2.7
636[basic.lval] CD4 Dynamic type of objects and aliasing Unknown
637[intro.execution] CD1 Sequencing rules and example disagree Clang 3.0
638[temp.friend] CD2 Explicit specialization and friendship No
639[intro.execution] CD1 What makes side effects “different” from one another? Clang 3.3
640[basic.start.dynamic] NAD Accessing destroyed local objects of static storage duration Unknown
641[over.match.viable] CD2 Overload resolution and conversion-to-same-type operators Clang 2.7
642[basic.scope.block] CD2 Definition and use of “block scope” and “local scope” Clang 2.7
643[dcl.type.simple] NAD Use of decltype in a class member-specification Clang 3.2
644[basic.types] CD1 Should a trivial class type be a literal type? Partial
645[class.mem] CD2 Are bit-field and non-bit-field members layout compatible? N/A
646[basic.types] NAD Can a class with a constexpr copy constructor be a literal type? Superseded by 981
647[dcl.constexpr] CD1 Non-constexpr instances of constexpr constructor templates Clang 3.1
648[dcl.constexpr] CD1 Constant expressions in constexpr initializers Clang 2.7
649[basic.align] CD1 Optionally ill-formed extended alignment requests Clang 3.5
650[class.temporary] CD2 Order of destruction for temporaries bound to the returned value of a function Clang 2.8
651[dcl.type.simple] CD1 Problems in decltype specification and examples Clang 2.7
652[expr.const] CD2 Compile-time evaluation of floating-point expressions Clang 3.1
653[class.copy.assign] CD2 Copy assignment of unions Clang 2.7
654[conv.ptr] CD1 Conversions to and from nullptr_t Superseded by 1423
655[class.base.init] C++11 Initialization not specified for forwarding constructors Clang 3.0
656[dcl.init.ref] CD2 Direct binding to the result of a conversion operator Clang 2.8
657[temp.deduct] CD2 Abstract class parameter in synthesized declaration Partial
658[expr.reinterpret.cast] CD2 Defining reinterpret_cast for pointer types Clang 2.7
659[expr.alignof] CD1 Alignment of function types Clang 3.0
660[dcl.enum] CD1 Unnamed scoped enumerations Clang 3.0
661[expr.rel] CD1 Semantics of arithmetic comparisons Clang 2.7
662[temp.deduct] NAD Forming a pointer to a reference type Clang 2.7
663[extendid] CD1 Valid Cyrillic identifier characters Superseded by P1949
664[dcl.init.ref] CD2 Direct binding of references to non-class rvalue references Clang 2.7
665[expr.dynamic.cast] CD2 Problems in the specification of dynamic_cast Clang 2.8
666[temp.res] CD1 Dependent qualified-ids without the typename keyword Clang 2.8
667[class.copy.ctor] CD2 Trivial special member functions that cannot be implicitly defined Clang 8
668[except.terminate] CD2 Throwing an exception from the destructor of a local static object Unknown
669[dcl.type.simple] NAD Confusing specification of the meaning of decltype Clang 3.1
670[dcl.init] CD4 Copy initialization via derived-to-base conversion in the second step Unknown
671[expr.static.cast] CD1 Explicit conversion from a scoped enumeration type to integral type Clang 2.9
672[expr.new] CD2 Sequencing of initialization in new-expressions Clang 2.7
673[namespace.memdef] NAD Injection of names from elaborated-type-specifiers in friend declarations Clang 2.7
674[temp.friend] C++11 “matching specialization” for a friend declaration Clang 8
675[class.bit] CD3 Signedness of bit-field with typedef or template parameter type Duplicate of 739
676[basic.def] C++11 static_assert-declarations and general requirements for declarations N/A
677[class.dtor] CD1 Deleted operator delete and virtual destructors No
678[basic.def.odr] C++11 Language linkage of member function parameter types and the ODR Unknown
679[temp.type] CD1 Equivalence of template-ids and operator function templates Clang 2.7
680[class.copy.ctor] CD2 What is a move constructor? N/A
681[dcl.fct] CD1 Restrictions on declarators with late-specified return types Partial
682[basic.lookup.classref] CD5 Missing description of lookup of template aliases Unknown
683[class.copy.ctor] CD1 Requirements for trivial subobject special functions Clang 3.3
684[expr.const] CD1 Constant expressions involving the address of an automatic variable Superseded by 1454
685[conv.prom] CD2 Integral promotion of enumeration ignores fixed underlying type Clang 10
686[dcl.name] CD1 Type declarations/definitions in type-specifier-seqs and type-ids Clang 3.0
687[expr.prim.general] NAD template keyword with unqualified-ids Unknown
688[basic.start.static] CD1 Constexpr constructors and static initialization Unknown
689[basic.fundamental] CD5 Maximum values of signed and unsigned integers Unknown
690[intro.defs] CD2 The dynamic type of an rvalue reference Unknown
691[temp.param] C++11 Template parameter packs in class template partial specializations Unknown
692[temp.deduct.type] C++11 Partial ordering of variadic class template partial specializations Clang 16
693[conv.array] CD2 New string types and deprecated conversion Unknown
694[dcl.init] C++11 Zero- and value-initialization of union objects Unknown
695[expr] CD2 Compile-time calculation errors in constexpr functions Unknown
696[class.local] C++11 Use of block-scope constants in local classes Clang 3.1
697[temp.deduct] open Deduction rules apply to more than functions Not resolved
698[intro.execution] open The definition of “sequenced before” is too narrow Not resolved
699[dcl.constexpr] CD2 Must constexpr member functions be defined in the class member-specification? Unknown
700[dcl.constexpr] C++11 Constexpr member functions of class templates Unknown
701[dcl.array] CD2 When is the array-to-pointer conversion applied? Unknown
702[over.ics.rank] CD2 Preferring conversion to std::initializer_list Unknown
703[dcl.init.list] CD2 Narrowing for literals that cannot be exactly represented Unknown
704[over.match.call] CD2 To which postfix-expressions does overload resolution apply? Unknown
705[basic.lookup.argdep] CD2 Suppressing argument-dependent lookup via parentheses Clang 2.7
706[dcl.spec.auto] NAD Use of auto with rvalue references Unknown
707[conv.fpint] CD2 Undefined behavior in integral-to-floating conversions Unknown
708[temp.spec.partial] open Partial specialization of member templates of class templates Not resolved
709[temp.deduct] C++11 Enumeration names as nested-name-specifiers in deduction failure Unknown
710[class.cdtor] CD2 Data races during construction Unknown
711[dcl.spec.auto] CD2 auto with braced-init-list Unknown
712[basic.def.odr] CD3 Are integer constant operands of a conditional-expression “used?” Partial
713[dcl.fct] CD2 Unclear note about cv-qualified function types Clang 3.0
714[class.static.data] CD2 Static const data members and braced-init-lists Unknown
715[expr.const] CD2 Class member access constant expressions Unknown
716[class.union] CD2 Specifications that should apply only to non-static union data members Unknown
717[dcl.stc] CD2 Unintentional restrictions on the use of thread_local Unknown
718[class.friend] NAD Non-class, non-function friend declarations Unknown
719[basic.pre] CD2 Specifications for operator-function-id that should also apply to literal-operator-id Unknown
720[expr.prim.lambda] CD2 Need examples of lambda-expressions Unknown
721[expr.const] CD2 Where must a variable be initialized to be used in a constant expression? Unknown
722[expr.call] CD2 Can nullptr be passed to an ellipsis? Clang 20
726[intro.multithread] CD2 Atomic and non-atomic objects in the memory model Unknown
727[temp.expl.spec] C++17 In-class explicit specializations Partial
728[temp] NAD Restrictions on local classes Unknown
729[except.handle] CD3 Qualification conversions and handlers of reference-to-pointer type Unknown
730[temp.expl.spec] CD2 Explicit specializations of members of non-template classes Unknown
731[expr.ref] CD2 Omitted reference qualification of member function type Unknown
732[dcl.fct.def] CD2 Late-specified return types in function definitions Unknown
733[class.copy.assign] NAD Reference qualification of copy assignment operators Unknown
734[expr.reinterpret.cast] CD2 Are unique addresses required for namespace-scope variables? Unknown
735[basic.stc.dynamic.safety] CD2 Missing case in specification of safely-derived pointers Unknown
736[dcl.decl] NAD Is the & ref-qualifier needed? Unknown
737[dcl.init.string] CD2 Uninitialized trailing characters in string initialization Unknown
738[class.ctor] C++11 constexpr not permitted by the syntax of constructor declarations Unknown
739[class.bit] CD3 Signedness of plain bit-fields Unknown
740[intro.multithread] CD2 Incorrect note on data races Unknown
741[class.bit] C++11 “plain” long long bit-fields Unknown
742[expr.post.incr] open Postfix increment/decrement with long bit-field operands Not resolved
743[expr.prim.general] CD2 Use of decltype in a nested-name-specifier Unknown
744[temp.arg.template] CD2 Matching template arguments with template template parameters with parameter packs Unknown
745[cpp.error] C++23 Effect of ill-formedness resulting from #error Unknown
746[dcl.spec.auto] CD2 Use of auto in new-expressions Unknown
747[class.access.base] dup Access of protected base classes Unknown
749[over.built] CD2 References to function types with a cv-qualifier or ref-qualifier Unknown
750[expr.prim.lambda.closure] CD2 Implementation constraints on reference-only closure objects Unknown
751[expr.prim.lambda.closure] CD2 Deriving from closure classes Unknown
752[expr.prim.lambda] CD2 Name lookup in nested lambda-expressions Unknown
753[expr.prim.lambda.capture] CD2 Array names in lambda capture sets Unknown
754[expr.prim.lambda] CD2 Lambda expressions in default arguments of block-scope function declarations Unknown
755[expr.prim.lambda.capture] CD3 Generalized lambda-captures Unknown
756[expr.prim.lambda.closure] CD2 Dropping cv-qualification on members of closure objects Unknown
757[basic.link] CD2 Types without linkage in declarations Unknown
758[basic.def] C++11 Missing cases of declarations that are not definitions Unknown
759[expr.prim.lambda.closure] CD2 Destruction of closure objects Unknown
760[expr.prim.general] CD2 this inside a nested class of a non-static member function Unknown
761[expr.prim.lambda.closure] CD2 Inferred return type of closure object call operator Unknown
762[expr.prim.lambda] CD2 Name lookup in the compound-statement of a lambda expression Unknown
763[expr.prim.lambda.closure] CD2 Is a closure object's operator() inline? Unknown
764[expr.prim.lambda.capture] CD2 Capturing unused variables in a lambda expression Unknown
765[dcl.fct.spec] CD2 Local types in inline functions with external linkage Unknown
766[expr.prim.lambda] CD2 Where may lambda expressions appear? Unknown
767[expr.prim.lambda] CD2 void and other unnamed lambda-parameters Unknown
768[expr.prim.lambda] CD2 Ellipsis in a lambda parameter list Unknown
769[expr.prim.lambda] CD2 Initialization of closure objects Unknown
770[dcl.decl] CD2 Ambiguity in late-specified return type Unknown
771[expr.prim.lambda.closure] CD2 Move-construction of reference members of closure objects Unknown
772[expr.prim.lambda.capture] CD2 capture-default in lambdas in local default arguments Unknown
773[temp.arg.nontype] C++11 Parentheses in address non-type template arguments Unknown
774[expr.prim.lambda.closure] CD2 Can a closure class be a POD? Unknown
775[expr.prim.lambda.capture] CD2 Capturing references to functions Unknown
776[basic.start.dynamic] CD2 Delegating constructors, destructors, and std::exit Unknown
777[dcl.fct.default] CD2 Default arguments and parameter packs Clang 3.7
778[temp.param] C++11 Template parameter packs in non-type template parameters Unknown
779[expr.prim.lambda.closure] CD2 Rvalue reference members of closure objects? Unknown
782[expr.prim.lambda] CD2 Lambda expressions and argument-dependent lookup Unknown
783[intro.defs] open Definition of “argument” Not resolved
784[intro.structure] C++11 List of incompatibilities with the previous Standard Unknown
785[intro.execution] CD2 “Execution sequence” is inappropriate phraseology Unknown
786[intro.multithread] CD2 Definition of “thread” Unknown
787[lex.phases] CD2 Unnecessary lexical undefined behavior Clang 21
788[lex.charset] CD2 Relationship between locale and values of the execution character set Unknown
789[lex.trigraph] CD2 Deprecating trigraphs Unknown
790[lex.string] CD2 Concatenation of raw and non-raw string literals Unknown
792[basic.start.main] CD2 Effects of std::quick_exit Unknown
793[basic.life] CD2 Use of class members during destruction Unknown
794[conv.mem] NAD Base-derived conversion in member type of pointer-to-member conversion Clang 2.7
795[expr.prim.lambda] NAD Dependency of lambdas on <functional> Unknown
796[expr.prim.lambda] CD2 Lifetime of a closure object with members captured by reference Unknown
797[expr.prim.lambda.closure] CD2 Converting a no-capture lambda to a function type Unknown
798[expr.sub] C++11 Overloaded subscript operator described in clause 5 Unknown
799[expr.reinterpret.cast] CD2 Can reinterpret_cast be used to cast an operand to its own type? Unknown
800[expr.reinterpret.cast] NAD Safely-derived pointers and object pointers converted from function pointers Unknown
801[expr.const.cast] CD2 Casting away constness in a cast to rvalue reference type Unknown
803[expr.sizeof] CD2 sizeof an enumeration type with a fixed underlying type Unknown
804[expr.new] CD2 Deducing the type in new auto(x) Unknown
805[expr.new] CD2 Which exception to throw for overflow in array size calculation Unknown
806[expr.const] CD2 Enumeration types in integral constant expressions Unknown
807[expr.const] NAD typeid expressions in constant expressions Unknown
808[dcl.spec] CD2 Non-type decl-specifiers versus max-munch Unknown
809[dcl.stc] CD2 Deprecation of the register keyword Unknown
810[dcl.stc] CD2 Block-scope thread_local variables should be implicitly static Unknown
811[dcl.type.cv] CD2 Unclear implications of const-qualification Unknown
812[namespace.def] CD2 Duplicate names in inline namespaces Unknown
813[namespace.udecl] open typename in a using-declaration with a non-dependent name Not resolved
814[dcl.attr] CD2 Attribute to indicate that a function throws nothing Unknown
815[dcl.attr.grammar] CD2 Parameter pack expansion inside attributes Unknown
816[dcl.attr.final] CD2 Diagnosing violations of [[final]] Unknown
817[dcl.attr.final] CD2 Meaning of [[final]] applied to a class definition Unknown
818[dcl.fct] CD2 Function parameter packs in non-final positions Unknown
819[special] NAD Access control and deleted implicitly-declared special member functions Unknown
820[temp] CD2 Deprecation of export Clang 2.7
822[temp] NAD Additional contexts for template aliases Unknown
823[temp.arg.nontype] CD2 Literal types with constexpr conversions as non-type template arguments Unknown
828[except.throw] CD2 Destruction of exception objects Unknown
829[except.spec] NAD At what point is std::unexpected called? Unknown
830[except.spec] CD2 Deprecating exception specifications Unknown
831[implimits] CD2 Limit on recursively nested template instantiations Unknown
832[lex.ppnumber] CD2 Value of preprocessing numbers Unknown
833[expr.static.cast] CD2 Explicit conversion of a scoped enumeration value to a floating type Unknown
834[lex.string] CD2 What is an “ordinary string literal”? Unknown
835[expr] CD2 Scoped enumerations and the “usual arithmetic conversions” Unknown
836[dcl.attr.noreturn] NAD [[noreturn]] applied to function types Unknown
837[dcl.constexpr] C++11 Constexpr functions and return braced-init-list Unknown
838[class.base.init] C++11 Use of this in a brace-or-equal-initializer Unknown
839[expr.sizeof] dup sizeof with opaque enumerations Unknown
840[temp.param] CD2 Rvalue references as nontype template parameters Unknown
842[expr.reinterpret.cast] CD2 Casting to rvalue reference type Unknown
845[dcl.fct.def] CD2 What is the “first declaration” of an explicit specialization? Unknown
846[basic.lval] CD2 Rvalue references to functions Unknown
847[temp.deduct.call] CD2 Error in rvalue reference deduction example Unknown
850[expr.prim.general] CD2 Restrictions on use of non-static data members Unknown
852[namespace.udecl] CD6 using-declarations and dependent base classes Unknown
853[basic.stc.dynamic.safety] CD2 Support for relaxed pointer safety Unknown
854[expr.shift] CD2 Left shift and unsigned extended types Unknown
855[expr.assign] CD2 Incorrect comments in braced-init-list assignment example Unknown
858[expr] CD2 Example binding an rvalue reference to an lvalue Unknown
860[dcl.constexpr] C++11 Explicit qualification of constexpr member functions Unknown
861[namespace.qual] CD2 Unintended ambiguity in inline namespace lookup Unknown
862[dcl.enum] CD2 Undefined behavior with enumerator value overflow Unknown
863[expr.post] CD2 Rvalue reference cast to incomplete type Unknown
864[stmt.ranged] C++11 braced-init-list in the range-based for statement Unknown
865[dcl.init.list] CD2 Initializing a std::initializer_list Unknown
869[dcl.init] CD2 Uninitialized thread_local objects Unknown
872[lex.string] CD2 Lexical issues with raw strings Unknown
873[temp.deduct.type] C++11 Deducing rvalue references in declarative contexts Clang 3.0
874[class.mem] CD2 Class-scope definitions of enumeration types Unknown
876[temp.deduct.call] CD2 Type references in rvalue reference deduction specification Unknown
877[over.match.viable] CD2 Viable functions and binding references to rvalues Unknown
879[over.built] CD2 Missing built-in comparison operators for pointer types Unknown
880[over.built] CD2 Built-in conditional operator for scoped enumerations Unknown
882[basic.start.main] CD2 Defining main as deleted Clang 3.5
883[basic.types] CD2 std::memcpy vs std::memmove Unknown
884[temp.expl.spec] CD2 Defining an explicitly-specialized static data member Unknown
885[temp.deduct.partial] NAD Partial ordering of function templates with unordered parameter pairs Unknown
886[dcl.init.aggr] CD2 Member initializers and aggregates Unknown
887[class.copy.ctor] CD2 Move construction of thrown object Unknown
888[class.base.init] CD2 Union member initializers Unknown
891[expr.const.cast] CD2 const_cast to rvalue reference from objectless rvalue Unknown
892[dcl.constexpr] C++11 Missing requirements for constexpr constructors Unknown
893[dcl.enum] NAD Brace syntax for enumerator-definitions Unknown
896[dcl.init.ref] CD2 Rvalue references and rvalue-reference conversion functions Unknown
897[cpp.pragma.op] open _Pragma and extended string-literals Not resolved
898[dcl.constexpr] C++11 Declarations in constexpr functions Unknown
899[over.match.copy] CD2 Explicit conversion functions in direct class initialization Unknown
900[class.temporary] C++23 Lifetime of temporaries in range-based for Unknown
901[expr.new] open Deleted operator delete Not resolved
902[class.static.data] NAD In-class initialization of non-constant static data members Unknown
903[temp.dep.constexpr] CD3 Value-dependent integral null pointer constants Unknown
904[expr.prim.lambda.capture] CD2 Parameter packs in lambda-captures Unknown
905[class] CD2 Explicit defaulted copy constructors and trivial copyability Unknown
906[dcl.fct.def] CD2 Which special member functions can be defaulted? Unknown
908[dcl.fct.def] CD2 Deleted global allocation and deallocation functions Unknown
909[expr.cast] NAD Old-style casts with conversion functions Unknown
910[class.copy.ctor] CD2 Move constructors and implicitly-declared copy constructors Unknown
912[lex.ccon] CD3 Character literals and universal-character-names Unknown
913[temp.deduct.conv] CD2 Deduction rules for array- and function-type conversion functions Unknown
914[expr.type.conv] open Value-initialization of array types Not resolved
915[dcl.fct.def] CD2 Deleted specializations of member function templates Unknown
919[namespace.def] CD2 Contradictions regarding inline namespaces Unknown
920[dcl.meaning] CD2 Interaction of inline namespaces and using-declarations Unknown
921[namespace.def] CD2 Unclear specification of inline namespaces Unknown
922[class.ctor] CD2 Implicit default constructor definitions and const variant members Unknown
923[temp.expl.spec] CD2 Inline explicit specializations Unknown
924[class.mem] C++11 alias-declaration as a class member Unknown
925[cpp.cond] open Type of character literals in preprocessor expressions Not resolved
926[namespace.unnamed] CD2 Inline unnamed namespaces Unknown
927[class.ctor] CD2 Implicitly-deleted default constructors and member initializers Unknown
928[dcl.fct.def] CD2 Defaulting a function that would be implicitly defined as deleted Unknown
929[temp.alias] CD2 What is a template alias? Unknown
930[expr.alignof] CD2 alignof with incomplete array type Clang 2.7
931[lex.ext] CD2 Confusing reference to the length of a user-defined string literal Unknown
932[lex.string] CD2 UCNs in closing delimiters of raw string literals Unknown
933[lex.ccon] CD2 32-bit UCNs with 16-bit wchar_t Unknown
934[dcl.init.list] CD2 List-initialization of references Unknown
935[over.literal] CD2 Missing overloads for character types for user-defined literals Unknown
936[dcl.init.string] CD2 Array initialization with new string literals Unknown
937[lex.ext] NAD Restrictions on values of template arguments in user-defined literals Unknown
938[dcl.init.aggr] C++11 Initializer lists and array new Unknown
939[class.virtual] CD2 Explicitly checking virtual function overriding Unknown
940[dcl.stc] CD2 Global anonymous unions Unknown
941[temp.expl.spec] C++11 Explicit specialization of deleted function template Unknown
942[basic.pre] CD2 Is this an entity? Unknown
943[expr.type.conv] CD5 Is T() a temporary? Unknown
944[expr.reinterpret.cast] NAD reinterpret_cast for all types with the same size and alignment Unknown
945[expr.prim.general] C++11 Use of this in a late-specified return type Unknown
946[basic.start.dynamic] CD2 Order of destruction of local static objects and calls to std::atexit Unknown
947[temp.over] NAD Deducing type template arguments from default function arguments Unknown
948[stmt.select] C++11 constexpr in conditions Clang 3.7
949[intro.compliance] open Requirements for freestanding implementations Not resolved
950[dcl.type.simple] CD2 Use of decltype as a class-name Unknown
951[dcl.attr] CD2 Problems with attribute-specifiers Unknown
952[class.access.base] CD6 Insufficient description of “naming class” Clang 2.8
953[over.ics.ref] CD2 Rvalue references and function viability Unknown
954[over.built] open Overload resolution of conversion operator templates with built-in types Not resolved
955[expr.prim.lambda.closure] CD2 Can a closure type's operator() be virtual? Unknown
956[dcl.fct] CD2 Function prototype scope with late-specified return types Unknown
957[dcl.attr.grammar] CD2 Alternative tokens and attribute-tokens Unknown
958[expr.prim.lambda] NAD Lambdas and decltype Unknown
959[dcl.align] CD2 Alignment attribute for class and enumeration types Unknown
960[class.virtual] CD2 Covariant functions and lvalue/rvalue references Clang 3.0
961[over.ics.rank] CD2 Overload resolution and conversion of std::nullptr_t to bool Unknown
962[dcl.type.elab] CD2 Attributes appertaining to class and enum types Unknown
963[expr.rel] CD2 Comparing nullptr with 0 Unknown
964[basic.lval] C++11 Incorrect description of when the lvalue-to-rvalue conversion applies Unknown
965[dcl.attr.depend] CD2 Limiting the applicability of the carries_dependency attribute Unknown
966[basic.link] CD2 Nested types without linkage Unknown
967[basic.stc.dynamic] NAD Exception specification of replacement allocation function Unknown
968[dcl.attr.grammar] CD2 Syntactic ambiguity of the attribute notation Unknown
969[temp.explicit] CD2 Explicit instantiation declarations of class template specializations Unknown
970[dcl.attr] CD2 Consistent use of “appertain” and “apply” Unknown
971[except.handle] C++11 Incorrect treatment of exception-declarations Unknown
972[dcl.attr.grammar] C++11 Allowing multiple attribute-specifiers Unknown
973[except.spec] CD2 Function types in exception-specifications Unknown
974[expr.prim.lambda] CD3 Default arguments for lambdas Clang 3.3
975[expr.prim.lambda] CD3 Restrictions on return type deduction for lambdas Unknown
976[temp.deduct.conv] CD2 Deduction for const T& conversion operators Unknown
977[dcl.enum] CD3 When is an enumeration type complete? Clang 2.7
978[over.best.ics] CD2 Incorrect specification for copy initialization Unknown
979[dcl.decl] CD2 Position of attribute-specifier in declarator syntax Unknown
980[temp.explicit] CD2 Explicit instantiation of a member of a class template Unknown
981[basic.types] C++11 Constexpr constructor templates and literal types Unknown
982[dcl.init.list] NAD Initialization with an empty initializer list Unknown
983[expr.unary.op] CD2 Ambiguous pointer-to-member constant Unknown
984[dcl.spec.auto] CD2 “Deduced type” is unclear in auto type deduction Unknown
985[lex.digraph] C++11 Alternative tokens and user-defined literals Unknown
986[namespace.udir] CD2 Transitivity of using-directives versus qualified lookup Unknown
987[basic.namespace] CD4 Which declarations introduce namespace members? Unknown
988[dcl.type.simple] CD2 Reference-to-reference collapsing with decltype Unknown
989[dcl.init.list] CD2 Misplaced list-initialization example Unknown
990[dcl.init.list] CD2 Value initialization with multiple initializer-list constructors Clang 3.5
991[dcl.constexpr] CD2 Reference parameters of constexpr functions and constructors Unknown
992[class.copy.ctor] NAD Inheriting explicitness Unknown
993[temp.point] C++11 Freedom to perform instantiation at the end of the translation unit Unknown
994[dcl.fct] C++11 braced-init-list as a default argument Unknown
995[temp.explicit] CD2 Incorrect example for using-declaration and explicit instantiation Unknown
996[temp.spec.partial] C++11 Ambiguous partial specializations of member class templates Unknown
997[basic.lookup.argdep] C++11 Argument-dependent lookup and dependent function template parameter types Unknown
998[dcl.fct] dup Function parameter transformations and template functions Unknown
999[over.match] CD2 “Implicit” or “implied” object argument/parameter? Unknown
1000[class.qual] CD2 Mistaking member typedefs for constructors Unknown
1001[dcl.fct] review Parameter type adjustment in dependent parameter types Not resolved
1002[temp.variadic] NAD Pack expansion for function arguments Unknown
1003[basic.start.main] CD3 Acceptable definitions of main Unknown
1004[temp.local] C++11 Injected-class-names as arguments for template template parameters Clang 5
1005[class.mfct.non.static] NAD Qualified name resolution in member functions of class templates Unknown
1006[temp.param] C++11 std::nullptr_t as a non-type template parameter Unknown
1007[class.protected] NAD Protected access and pointers to members Unknown
1008[expr.alignof] NAD Querying the alignment of an object Unknown
1009[temp] C++11 Missing cases in the declarator-id of a function template declaration Unknown
1010[expr.const] CD2 Address of object with dynamic storage duration in constant expression Unknown
1011[expr.static.cast] C++11 Standard conversions that cannot be inverted Unknown
1012[namespace.unnamed] C++11 Undeprecating static Unknown
1013[conv.lval] CD3 Uninitialized std::nullptr_t objects Unknown
1014[temp.deduct.call] NAD Overload resolution between const T& and T&& Unknown
1015[basic.lookup.argdep] C++11 Template arguments and argument-dependent lookup Unknown
1016[over] C++11 Overloadable declarations, function templates, and references Unknown
1017[class.mfct.non.static] C++11 Member access transformation in unevaluated operands Unknown
1018[dcl.pre] C++11 Ambiguity between simple-declaration and attribute-declaration Unknown
1019[class.derived] dup Dependent simple-template-ids in base-specifiers and mem-initializers Unknown
1020[class.copy.ctor] C++11 Implicitly-defined copy constructors and explicit base class constructors Unknown
1021[namespace.memdef] CD4 Definitions of namespace members Unknown
1022[dcl.enum] C++11 Can an enumeration variable have values outside the values of the enumeration? Unknown
1023[temp.arg.nontype] dup thread_local objects as non-type template arguments Unknown
1024[lex.ccon] CD3 Limits on multicharacter literals Unknown
1025[temp.arg.nontype] C++11 Use of a reference as a non-type template argument Unknown
1026[basic.lval] NAD Cv-qualified non-class rvalues Unknown
1027[basic.life] review Type consistency and reallocation of scalar types Not resolved
1028[temp.dep.res] CD6 Dependent names in non-defining declarations Unknown
1029[class.dtor] C++11 Type of a destructor call Unknown
1030[dcl.init.aggr] C++11 Evaluation order in initializer-lists used in aggregate initialization Unknown
1031[dcl.attr.grammar] C++11 Optional elements in attributes Unknown
1032[temp.variadic] C++11 Empty pack expansions Unknown
1033[dcl.align] C++11 Restrictions on alignment attributes Unknown
1034[expr.prim.lambda] C++11 Attributes for return statements in lambdas Unknown
1035[class.mem] C++11 Omitted and required decl-specifiers Unknown
1036[dcl.align] C++11 Alignment attribute in an exception-declaration Unknown
1037[expr.delete] C++11 Requirements for operands of delete-expressions and deallocation functions Unknown
1038[over.over] CD7 Overload resolution of &x.static_func Unknown
1039[dcl.align] dup Coordinating C and C++ alignment specifications Unknown
1040[intro.multithread] NAD Memory model issues Unknown
1041[class.mem] dup alias-declarations as class members Unknown
1042[dcl.pre] C++11 Attributes in alias-declarations Clang 3.5
1043[temp.dep.type] C++11 Qualified name lookup in the current instantiation Unknown
1044[basic.scope.pdecl] C++11 Point of declaration for an alias-declaration Unknown
1045[temp.explicit] NAD Requiring explicit instantiation declarations Unknown
1046[temp.explicit] open What is a “use” of a class specialization? Not resolved
1047[temp.dep.constexpr] C++11 When is typeid value-dependent? Unknown
1048[expr.prim.lambda] CD3 auto deduction and lambda return type deduction Clang 3.6
1049[class.copy.elision] open Copy elision through reference parameters of inline functions Not resolved
1050[basic.life] NAD Effects of thread support on object lifetime Unknown
1051[class.copy.ctor] C++11 Reference members and generated copy constructors Unknown
1052[class.copy.ctor] dup const non-static data member and PODness Unknown
1053[except.spec] NAD Terminate vs undefined behavior for noexcept violation Unknown
1054[stmt.expr] C++11 Lvalue-to-rvalue conversions in expression statements No
1055[basic.fundamental] C++11 Permissible uses of void Unknown
1056[temp.alias] C++11 Template aliases, member definitions, and the current instantiation Unknown
1057[temp.dep.type] C++11 decltype and the current instantiation Unknown
1058[dcl.init.ref] NAD Reference binding of incompatible array types Unknown
1059[basic.type.qualifier] CD3 Cv-qualified array types (with rvalues) Unknown
1060[expr.const] C++11 Scoped enumerators in integral constant expressions Unknown
1061[expr.new] C++11 Negative array bounds in a new-expression Unknown
1062[expr.prim.lambda] C++11 Syntax of attribute-specifiers in lambdas Unknown
1063[dcl.attr.override] C++11 [[hiding]] with non-attribute declarations Unknown
1064[class.copy.ctor] C++11 Defaulted move constructor for a union Unknown
1065[dcl.attr.override] C++11 [[hiding]] with [[override]] Unknown
1066[class.copy.assign] C++11 When is a copy/move assignment operator implicitly defined? Unknown
1067[dcl.attr.override] NAD [[hiding]], using-declarations, and multiple inheritance Unknown
1068[temp.param] C++11 Template aliases with default arguments and template parameter packs Unknown
1069[dcl.fct] C++11 Incorrect function type with trailing-return-type Unknown
1070[dcl.init.aggr] C++11 Missing initializer clauses in aggregate initialization Clang 3.5
1071[basic.types] C++11 Literal class types and trivial default constructors Unknown
1072[class.mem] C++11 Scoped enumerator with the same name as its containing class Unknown
1073[except.spec] C++11 Merging dynamic-exception-specifications and noexcept-specifications Unknown
1074[temp.dep.constexpr] C++11 Value-dependent noexcept-expressions Unknown
1075[dcl.type.simple] C++11 Grammar does not allow template alias in type-name Unknown
1076[basic.lval] CD5 Value categories and lvalue temporaries Unknown
1077[namespace.memdef] NAD Explicit specializations in non-containing namespaces Unknown
1078[dcl.init.list] NAD Narrowing and the usual arithmetic conversions Unknown
1079[over.ics.rank] C++11 Overload resolution involving aggregate initialization Unknown
1080[class.copy.ctor] C++11 Confusing relationship between templates and copy constructors Unknown
1081[class.dtor] C++11 Defaulted destructor and unusable operator delete Unknown
1082[class.copy.ctor] C++11 Implicit copy function if subobject has none? Unknown
1083[expr.call] C++11 Passing an object to ellipsis with non-trivial move constructor Unknown
1084[class.copy.ctor] NAD Conditions for a deleted move function Unknown
1085[class.copy.assign] NAD Move assignment operators and virtual bases Unknown
1086[expr.const.cast] C++11 const_cast to rvalue reference to function type Unknown
1087[over.match.copy] C++11 Additional applications of issue 899 Unknown
1088[temp.dep.constexpr] C++11 Dependent non-type template arguments Unknown
1089[basic.lookup.qual.general] open Template parameters in member selections Not resolved
1090[basic.align] C++11 Alignment of subobjects Unknown
1091[expr.mptr.oper] C++11 Inconsistent use of the term “object expression” Unknown
1092[class.copy.ctor] drafting Cycles in overload resolution during instantiation Not resolved
1093[dcl.init] CD3 Value-initializing non-objects Unknown
1094[expr.static.cast] C++11 Converting floating-point values to scoped enumeration types Unknown
1095[dcl.init.list] C++11 List-initialization of references Unknown
1096[temp] C++11 Missing requirement for template definitions Unknown
1097[dcl.init.aggr] NAD Aggregate initialization of function parameters Unknown
1098[expr.const] C++11 Pointer conversions in constant expressions Unknown
1099[expr.const] C++11 Infinite recursion in constexpr functions Unknown
1100[expr.const] C++11 constexpr conversion functions and non-type template arguments Unknown
1101[class.static.data] C++11 Non-integral initialized static data members Unknown
1102[intro.execution] C++11 Better example of undefined behavior Unknown
1103[lex.phases] C++11 Reversion of phase 1 and 2 transformations in raw string literals Unknown
1104[lex.digraph] C++11 Global-scope template arguments vs the <: digraph Unknown
1105[lex.name] C++11 Issues relating to TR 10176:2003 Unknown
1106[lex.nullptr] C++11 Need more detail in nullptr keyword description Unknown
1107[lex.ext] C++11 Overload resolution for user-defined integer literals Unknown
1108[lex.ext] NAD User-defined literals have not been implemented Unknown
1109[basic.def.odr] C++11 When is “use” a reference to the ODR meaning? Unknown
1110[basic.def.odr] NAD Incomplete return type should be allowed in decltype operand Clang 3.1
1111[basic.lookup.classref] C++11 Remove dual-scope lookup of member template names Partial
1112[basic.link] C++11 constexpr variables should have internal linkage like const Unknown
1113[basic.link] C++11 Linkage of namespace member of unnamed namespace Partial
1114[basic.life] C++11 Incorrect use of placement new in example Unknown
1115[basic.align] C++11 C-compatible alignment specification Unknown
1116[basic.life] CD4 Aliasing of union members Unknown
1117[expr] C++11 Incorrect note about xvalue member access expressions Unknown
1118[expr.prim.lambda.capture] NAD Implicit lambda capture via explicit copy constructor Unknown
1119[expr.ref] C++11 Missing case in description of member access ambiguity Unknown
1120[expr.reinterpret.cast] C++11 reinterpret_cast and void* Unknown
1121[expr.unary.op] C++11 Unnecessary ambiguity error in formation of pointer to member Unknown
1122[expr.sizeof] C++11 Circular definition of std::size_t Unknown
1123[expr.unary.noexcept] C++11 Destructors should be noexcept by default Unknown
1124[expr.mptr.oper] NAD Error in description of value category of pointer-to-member expression Unknown
1125[expr.const] C++11 Unclear definition of “potential constant expression” Unknown
1126[expr.const] C++11 constexpr functions in const initializers Unknown
1127[expr.const] C++11 Overload resolution in constexpr functions Unknown
1128[dcl.spec] C++11 attribute-specifiers in decl-specifier-seqs Unknown
1129[dcl.constexpr] C++11 Default nothrow for constexpr functions Unknown
1130[dcl.type.simple] C++11 Function parameter type adjustments and decltype Unknown
1131[dcl.type.elab] C++11 Template aliases in elaborated-type-specifiers Unknown
1132[dcl.attr.noreturn] NAD Keyword vs attribute for noreturn Unknown
1133[dcl.attr.override] C++11 Keywords vs attributes for control of hiding and overriding Unknown
1134[dcl.fct.def.default] C++11 When is an explicitly-defaulted function defined? Unknown
1135[dcl.fct.def.default] C++11 Explicitly-defaulted non-public special member functions Unknown
1136[dcl.fct.def.default] C++11 Explicitly-defaulted explicit constructors Unknown
1137[dcl.fct.def.default] C++11 Explicitly-defaulted virtual special member functions Unknown
1138[dcl.init.ref] C++11 Rvalue-ness check for rvalue reference binding is wrong Unknown
1139[dcl.init.ref] C++11 Rvalue reference binding to scalar xvalues Unknown
1140[class] C++11 Incorrect redefinition of POD class Unknown
1141[class.mem] NAD Non-static data member initializers have not been implemented Unknown
1142[class.mfct] C++11 friend declaration of member function of containing class Unknown
1143[class.mfct.non.static] NAD Move semantics for *this have not been implemented Unknown
1144[class.access.dcl] C++11 Remove access declarations Unknown
1145[class.ctor] C++11 Defaulting and triviality Unknown
1146[class.dtor] C++11 exception-specifications of defaulted functions Unknown
1147[class.dtor] C++11 Destructors should be default nothrow Unknown
1148[class.copy.elision] C++11 Copy elision and move construction of function parameters Unknown
1149[class.copy.ctor] C++11 Trivial non-public copy operators in subobjects Unknown
1150[class.inhctor] NAD Inheriting constructors have not been implemented N/A
1151[over.match.list] C++11 Overload resolution with initializer-list and non-list constructors Unknown
1152[over.match.viable] C++11 Rules for determining existence of implicit conversion sequence Unknown
1153[over.over] C++11 Type matching in address of overloaded function Unknown
1154[temp.arg.nontype] C++11 Address of thread_local variable as non-type template argument Unknown
1155[temp.arg.nontype] C++11 Internal-linkage non-type template arguments Unknown
1156[temp.func.order] C++11 Partial ordering in a non-call context Unknown
1157[temp.func.order] open Partial ordering of function templates is still underspecified Not resolved
1158[temp.alias] C++11 Recursive instantiation via alias template Unknown
1159[temp.alias] C++11 Class and enumeration definitions in template aliases Unknown
1160[temp.dep.type] C++11 Definitions of template members and the current instantiation Unknown
1161[temp.res] C++11 Dependent nested-name-specifier in a pointer-to-member declarator Unknown
1162[temp.res] NAD Dependent elaborated-type-specifiers in non-deduced contexts Unknown
1163[temp.explicit] NAD extern template prevents inlining functions not marked inline Unknown
1164[temp.deduct.call] C++11 Partial ordering of f(T&) and f(T&&) Unknown
1165[except.ctor] C++11 Exceptions when destroying array elements Unknown
1166[except.handle] C++11 exception-declarations that do not declare objects Unknown
1167[except.spec] C++11 function-try-blocks for destructors Unknown
1168[except.terminate] C++11 Additional reasons to call std::terminate Unknown
1169[cpp.predefined] C++11 Missing feature macro for strict pointer safety Unknown
1170[temp.deduct] C++11 Access checking during template argument deduction Unknown
1171[except.terminate] C++11 Partial stack unwinding with noexcept violation Unknown
1172[temp.deduct] drafting “instantiation-dependent” constructs Not resolved
1173[intro.execution] C++11 Unclear specification of effects of signal handling Unknown
1174[basic.def.odr] C++11 When is a pure virtual function “used?” Unknown
1175[lex.ext] C++11 Disambiguating user-defined literals Unknown
1176[intro.multithread] C++11 Definition of release sequence Unknown
1177[intro.multithread] C++11 Intra-thread dependency-ordered-before Unknown
1178[temp.deduct.decl] C++11 Deduction failure matching placement new Unknown
1179[temp.param] NAD Cv-qualification of non-type template parameters Unknown
1180[basic.align] C++11 Over-aligned class types Unknown
1181[basic.types] C++11 What is a “built-in type?” Unknown
1182[temp.variadic] C++11 Incorrect description of pack expansion syntax Unknown
1183[dcl.fct] C++11 Expansion of parameter packs in declarators Unknown
1184[temp.deduct.call] C++11 Argument conversions to nondeduced parameter types Unknown
1185[dcl.link] C++11 Misleading description of language linkage and member function types Unknown
1186[dcl.constexpr] C++11 Non-dependent constexpr violations in function templates Unknown
1187[basic.start.static] C++11 Problems in initialization example Unknown
1188[expr.const] C++11 Type punning in constant expressions Unknown
1189[intro.object] C++11 Address of distinct base class subobjects Unknown
1190[basic.stc.dynamic.safety] C++11 Operations on non-safely-derived pointers Unknown
1191[class.ctor] C++11 Deleted subobject destructors and implicitly-defined constructors Unknown
1192[basic.def.odr] C++11 Inadvertent change to ODR and templates Unknown
1193[expr.const] C++11 Use of address-constant pointers in constant expressions Unknown
1194[dcl.constexpr] C++11 Constexpr references Unknown
1195[dcl.constexpr] C++11 References to non-literal types in constexpr functions Unknown
1196[temp.explicit] C++11 Definition required for explicit instantiation after explicit specialization? Unknown
1197[expr.const] C++11 Constexpr arrays Unknown
1198[basic.types] C++11 Literal types and copy constructors Unknown
1199[dcl.constexpr] C++11 Deleted constexpr functions Unknown
1200[basic.lookup.unqual] CD6 Lookup rules for template parameters N/A
1201[basic.def] C++11 Are deleted and defaulted functions definitions? Unknown
1202[class.cdtor] C++11 Calling virtual functions during destruction Unknown
1203[class.static.data] dup Misleading note regarding initialized static data members Unknown
1204[stmt.iter] C++11 Specifiers in a for-range-declaration Unknown
1205[over.ics.ref] dup Lvalue reference binding and function viability Unknown
1206[temp.class] C++11 Defining opaque enumeration members of class templates Unknown
1207[class.mfct.non.static] C++11 Type of class member in trailing-return-type Unknown
1208[class.mfct.non.static] C++11 Explicit noexcept in defaulted definition Unknown
1209[basic.def.odr] open Is a potentially-evaluated expression in a template definition a “use?” Not resolved
1210[basic.scope.pdecl] C++11 Injection of elaborated-type-specifier in enumeration scope Unknown
1211[basic.align] open Misaligned lvalues Not resolved
1212[dcl.type.simple] C++11 Non-function-call xvalues and decltype Unknown
1213[expr.sub] CD3 Array subscripting and xvalues Clang 7
1214[dcl.init] C++11 Kinds of initializers Unknown
1215[class] C++11 Definition of POD struct Unknown
1216[except.spec] C++11 Exceptions “allowed” by a noexcept-specification Unknown
1217[dcl.fct.def.delete] NAD Are deleted functions implicitly noexcept? Unknown
1218[except.handle] C++11 What is the “currently-handled exception” in a multi-threaded program? Unknown
1219[basic.types] C++11 Non-static data member initializers in constant expressions Unknown
1220[basic.lookup.classref] C++11 Looking up conversion-type-ids Unknown
1221[temp.deduct.partial] open Partial ordering and reference collapsing Not resolved
1222[dcl.array] NAD Unnecessary restriction on auto array types Unknown
1223[stmt.ambig] CD7 Syntactic disambiguation and trailing-return-types Clang 17
1224[class.copy.ctor] C++11 constexpr defaulted copy constructors Unknown
1225[dcl.constexpr] C++11 constexpr constructors and virtual bases Unknown
1226[dcl.fct.default] CD3 Converting a braced-init-list default argument Unknown
1227[temp.deduct] CD3 Mixing immediate and non-immediate contexts in deduction failure Clang 3.0
1228[over.match.list] NAD Copy-list-initialization and explicit constructors Unknown
1229[over.match.list] C++11 Overload resolution with empty braced-init-list argument Unknown
1230[expr.unary.op] dup Confusing description of ambiguity of destructor name Unknown
1231[temp.variadic] C++11 Variadic templates requiring an empty pack expansion Unknown
1232[dcl.init.list] C++11 Creation of array temporaries using a braced-init-list Unknown
1233[temp.dep] C++11 Pack expansions and dependent calls Unknown
1234[dcl.name] C++11 abstract-declarator does not permit ... after ptr-operator Unknown
1235[temp.func.order] C++11 “Unused” ellipsis and default arguments in partial ordering Unknown
1236[dcl.init.ref] C++11 Inconsistently-interrelated examples Unknown
1237[class.temporary] C++11 Deprecated implicit copy assignment in example Unknown
1238[over.ics.rank] C++11 Overloading ambiguity binding reference to function Unknown
1239[lex.ext] C++11 Hexadecimal floating-point literals vs user-defined literals Unknown
1240[dcl.name] C++11 constexpr defaulted constructors Unknown
1241[class.dtor] C++11 Which members does a destructor destroy? Unknown
1242[class.base.init] C++11 Initializing variant class members Unknown
1243[dcl.decl] C++11 Misleading footnote regarding multiple-declarator declarations Unknown
1244[temp.type] C++11 Equivalence of alias templates and class templates Unknown
1245[temp.mem.func] C++11 Matching declarations involving decltype Unknown
1246[temp.param] C++11 Non-deduced non-final parameter packs Unknown
1247[dcl.typedef] CD4 Restriction on alias name appearing in type-id Unknown
1248[diff.iso] open Updating Annex C to C99 and C23 Not resolved
1249[expr.prim.lambda.capture] CD6 Cv-qualification of nested lambda capture Unknown
1250[class.virtual] CD3 Cv-qualification of incomplete virtual function return types Clang 3.9
1251[diff.conv] CD3 C compatibility: casting to unqualified void* Unknown
1252[over.load] CD6 Overloading member function templates based on dependent return type Unknown
1253[temp.spec] C++17 Generic non-template members Unknown
1254[basic.def.odr] NAD odr-use vs template arguments and constexpr functions Unknown
1255[expr.const] drafting Definition problems with constexpr functions Not resolved
1256[expr.const] open Unevaluated operands are not necessarily constant expressions Not resolved
1257[temp.res] open Instantiation via non-dependent references in uninstantiated templates Not resolved
1258[temp.point] CD5 “Instantiation context” differs from dependent lookup rules Unknown
1259[expr.delete] NAD Deleting a POD via a pointer to base Unknown
1260[basic.def.odr] CD3 Incorrect use of term “overloaded” in description of odr-use Unknown
1261[expr] CD3 Explicit handling of cv-qualification with non-class prvalues Unknown
1262[temp.deduct] CD3 Default template arguments and deduction failure Unknown
1263[dcl.init.ref] NAD Mismatch between rvalue reference binding and overload resolution Unknown
1264[expr.const] CD3 Use of this in constexpr constructor Unknown
1265[dcl.spec.auto] CD3 Mixed use of the auto specifier Clang 5
1266[lex.ext] open user-defined-integer-literal overflow Not resolved
1267[except.spec] CD3 Rvalue reference types in exception-specifications Unknown
1268[expr.reinterpret.cast] CD3 reinterpret_cast of an xvalue operand Unknown
1269[expr.dynamic.cast] CD3 dynamic_cast of an xvalue operand Unknown
1270[dcl.init.list] CD3 Brace elision in array temporary initialization Unknown
1271[temp.res] CD5 Imprecise wording regarding dependent types Unknown
1272[class.static.data] NAD Implicit definition of static data member of const literal type Unknown
1273[temp.deduct] NAD Accessibility and function signatures Unknown
1274[stmt.ranged] CD4 Common nonterminal for expression and braced-init-list Unknown
1275[temp.param] CD3 Incorrect comment in example of template parameter pack restriction Unknown
1276[basic.fundamental] NAD Reference to stdint.h Unknown
1277[cstdint.syn] NAD Lax definition of intmax_t and uintmax_t Unknown
1278[over.call.func] drafting Incorrect treatment of contrived object Not resolved
1279[diff.cpp03] open Additional differences between C++ 2003 and C++ 2011 Not resolved
1280[basic.life] NAD Object reallocation and reference members Unknown
1281[temp.dep.type] NAD Virtual and dependent base classes Unknown
1282[except.spec] CD3 Underspecified destructor exception-specification Unknown
1283[class.static.data] open Static data members of classes with typedef name for linkage purposes Not resolved
1284[basic.life] CD4 Should the lifetime of an array be independent of that of its elements? Unknown
1285[basic.life] NAD Trivial destructors and object lifetime Unknown
1286[temp.alias] open Equivalence of alias templates Not resolved
1287[dcl.init.ref] C++14 Direct initialization vs “implicit” conversion in reference binding Unknown
1288[dcl.init.list] CD3 Reference list initialization Unknown
1289[temp.dep.type] NAD Can an alias template name the current instantiation? Unknown
1290[dcl.init.list] CD3 Lifetime of the underlying array of an initializer_list member Unknown
1291[basic.lookup.classref] CD6 Looking up a conversion-type-id N/A
1292[temp.dep] CD4 Dependent calls with braced-init-lists containing a pack expansion Unknown
1293[expr.const] CD3 String literals in constant expressions Unknown
1294[basic.start.static] open Side effects in dynamic/static initialization Not resolved
1295[dcl.init.ref] CD3 Binding a reference to an rvalue bit-field Clang 4
1296[temp.res] CD3 Ill-formed template declarations (not just definitions) Unknown
1297[dcl.decl] CD3 Misplaced function attribute-specifier Unknown
1298[over.ics.rank] CD3 Incorrect example in overload resolution Unknown
1299[class.temporary] CD5 “Temporary objects” vs “temporary expressions” Unknown
1300[expr.type.conv] dup T() for array types Unknown
1301[dcl.init] CD3 Value initialization of union Unknown
1302[basic.fundamental] CD3 noexcept applied to expression of type void Unknown
1303[temp] NAD C language linkage for template with internal linkage Unknown
1304[dcl.init.string] drafting Omitted array bound with string initialization Not resolved
1305[expr.alignof] CD3 alignof applied to array of unknown size Clang 3.0
1306[class.this] CD3 Modifying an object within a const member function Unknown
1307[over.ics.list] C++14 Overload resolution based on size of array initializer-list Clang 14
1308[class.mem] CD3 Completeness of class type within an exception-specification Superseded by 1330
1309[temp.dep.type] CD4 Incorrect note regarding lookup of a member of the current instantiation Unknown
1310[class.qual] CD3 What is an “acceptable lookup result?” Clang 5
1311[expr.const] CD3 Volatile lvalues in constant expressions Unknown
1312[expr.const] CD3 Simulated reinterpret_cast in constant expressions Unknown
1313[expr.const] CD3 Undefined pointer arithmetic in constant expressions Unknown
1314[expr.add] NAD Pointer arithmetic within standard-layout objects Unknown
1315[temp.spec.partial.general] CD4 Restrictions on non-type template arguments in partial specializations Partial
1316[dcl.constexpr] NAD constexpr function requirements and class scope Unknown
1317[dcl.enum] NAD Unnamed scoped enumerations Unknown
1318[class] CD3 Syntactic ambiguities with final Unknown
1319[temp.param] NAD Error in pack expansion example Unknown
1320[expr.static.cast] CD3 Converting scoped enumerations to bool Unknown
1321[temp.over.link] CD3 Equivalency of dependent calls Unknown
1322[temp.deduct] drafting Function parameter type decay in templates Not resolved
1323[dcl.attr.grammar] NAD Nonexistent nonterminal in alignment-specifier grammar Unknown
1324[dcl.init] CD3 Value initialization and defaulted constructors Unknown
1325[dcl.pre] NAD Omitted declarator in friend declarations Unknown
1326[temp.deduct.call] dup Deducing an array bound from an initializer-list Unknown
1327[dcl.fct.def.default] CD3 virt-specifier in a defaulted definition Unknown
1328[dcl.init.ref] CD3 Conflict in reference binding vs overload resolution Unknown
1329[implimits] CD3 Recursive deduction substitutions Unknown
1330[temp.deduct] CD3 Delayed instantiation of noexcept specifiers Clang 4 (C++11 onwards)
1331[dcl.fct.def.default] CD5 const mismatch with defaulted copy constructor Unknown
1332[lex.charset] CD5 Handling of invalid universal-character-names Unknown
1333[dcl.fct.def.default] CD3 Omission of const in a defaulted copy constructor Unknown
1334[basic.types] NAD Layout compatibility and cv-qualification Superseded by 1719
1335[cpp.stringize] CD6 Stringizing, extended characters, and universal-character-names Unknown
1336[class.conv.ctor] CD3 Definition of “converting constructor” Unknown
1337[temp.deduct.partial] dup Partial ordering and non-deduced parameters Unknown
1338[basic.stc.dynamic.allocation] CD4 Aliasing and allocation functions Unknown
1339[dcl.init] NAD Parenthesized braced-init-list and arrays Unknown
1340[expr.mptr.oper] CD3 Complete type in member pointer expressions Clang 2.9
1341[class.mem] NAD Bit-field initializers Superseded by P0683R1
1342[dcl.decl] CD6 Order of initialization with multiple declarators Unknown
1343[intro.execution] C++17 Sequencing of non-class initialization Unknown
1344[class.copy.ctor] C++14 Adding new special member functions to a class via default arguments Unknown
1345[class.base.init] CD3 Initialization of anonymous union class members Unknown
1346[dcl.spec.auto] CD3 expression-list initializers and the auto specifier Clang 3.5
1347[dcl.spec.auto] CD3 Consistency of auto in multiple-declarator declarations Clang 3.1
1348[dcl.spec.auto] drafting Use of auto in a trailing-return-type Not resolved
1349[temp.alias] dup Consistency of alias template redeclarations Unknown
1350[class.inhctor] CD3 Incorrect exception specification for inherited constructors Clang 3.5
1351[except.spec] CD4 Problems with implicitly-declared exception-specifications Unknown
1352[basic.scope.class] CD3 Inconsistent class scope and completeness rules Clang 3.0
1353[class.ctor] CD7 Array and variant members and deleted special member functions Unknown
1354[expr.unary.noexcept] CD3 Destructor exceptions for temporaries in noexcept expressions Unknown
1355[dcl.fct.def.default] CD3 Aggregates and “user-provided” constructors Unknown
1356[except.spec] CD4 Exception specifications of copy assignment operators with virtual bases Unknown
1357[class.mem] CD3 brace-or-equal-initializers for function and typedef members Unknown
1358[dcl.constexpr] CD3 Unintentionally ill-formed constexpr function template instances Clang 3.1
1359[dcl.constexpr] CD3 constexpr union constructors Clang 3.5
1360[class.ctor] CD6 constexpr defaulted default constructors Unknown
1361[basic.types] CD3 Requirement on brace-or-equal-initializers of literal types Unknown
1362[basic.def.odr] CD3 Complete type required for implicit conversion to T& Unknown
1363[class] CD3 Triviality vs multiple default constructors Unknown
1364[expr.const] CD3 constexpr function parameters Unknown
1365[expr.const] CD3 Calling undefined constexpr functions Unknown
1366[dcl.constexpr] CD3 Deleted constexpr constructors and virtual base classes Unknown
1367[expr.const] CD3 Use of this in a constant expression Unknown
1368[dcl.init] CD3 Value initialization and defaulted constructors (part 2) Unknown
1369[dcl.constexpr] CD3 Function invocation substitution of this Unknown
1370[cpp.replace] CD3 identifier-list cannot contain ellipsis Unknown
1371[temp.deduct.type] NAD Deduction from T&& in return types Unknown
1372[temp.deduct.conv] CD3 Cross-references incorrect in conversion function template argument deduction Unknown
1373[over.match.ref] dup Overload resolution changes matching reference-binding changes Unknown
1374[over.ics.rank] CD3 Qualification conversion vs difference in reference binding Unknown
1375[class.union] CD3 Reference to anonymous union? Unknown
1376[expr.static.cast] C++14 static_cast of temporary to rvalue reference Unknown
1377[diff.cpp03] dup Access declarations not mentioned in Annex C Unknown
1378[temp.inst] CD5 When is an instantiation required? Unknown
1379[dcl.init.list] NAD Is std::initializer_list an aggregate? Unknown
1380[dcl.fct] CD3 Type definitions in template-parameter parameter-declarations Unknown
1381[except.spec] CD3 Implicitly-declared special member functions and default nothrow Unknown
1382[dcl.decl] CD3 Dead code for constructor names Unknown
1383[expr] CD3 Clarifying discarded-value expressions Unknown
1384[expr.const] NAD reinterpret_cast in constant expressions Unknown
1385[over.match.oper] CD3 Syntactic forms of conversion functions for surrogate call functions Unknown
1386[temp.arg.explicit] NAD Explicitly-specified partial argument list with multiple parameter packs Unknown
1387[temp.deduct.type] CD3 Missing non-deduced context for decltype Unknown
1388[temp.deduct.call] CD3 Missing non-deduced context following a function parameter pack Clang 4
1389[dcl.fct] NAD Recursive reference in trailing-return-type Unknown
1390[temp.dep.type] drafting Dependency of alias template specializations Not resolved
1391[temp.arg.explicit] CD4 Conversions to parameter types with non-deduced template arguments Partial
1392[over.match.ref] CD3 Explicit conversion functions for references and non-references Unknown
1393[temp.variadic] C++17 Pack expansions in using-declarations Unknown
1394[dcl.fct] CD3 Incomplete types as parameters of deleted functions Clang 15
1395[temp.deduct.type] C++17 Partial ordering of variadic templates reconsidered Clang 16
1396[temp.inst] C++23 Deferred instantiation and checking of non-static data member initializers Unknown
1397[class.mem] CD4 Class completeness in non-static data member initializers Clang 3.2
1398[temp.arg.nontype] CD3 Non-type template parameters of type std::nullptr_t Unknown
1399[temp.deduct.call] CD3 Deduction with multiple function parameter packs Duplicate of 1388
1400[expr.eq] NAD Function pointer equality Unknown
1401[dcl.init.ref] CD3 Similar types and reference compatibility Unknown
1402[class.copy.ctor] CD3 Move functions too often deleted Unknown
1403[lex.comment] CD6 Universal-character-names in comments Unknown
1404[class.union] open Object reallocation in unions Not resolved
1405[basic.types] CD3 constexpr and mutable members of literal types Unknown
1406[temp.func.order] CD3 ref-qualifiers and added parameters of non-static member function templates Unknown
1407[expr.const] NAD Integral to bool conversion in converted constant expressions Unknown
1408[over.ics.rank] CD3 What is “the same aggregate initialization?” Unknown
1409[over.ics.list] CD3 What is the second standard conversion sequence of a list-initialization sequence? Unknown
1410[over.ics.rank] CD3 Reference overload tiebreakers should apply to rvalue references Unknown
1411[class] CD3 More on global scope :: in nested-name-specifier Unknown
1412[expr.static.cast] CD3 Problems in specifying pointer conversions Unknown
1413[temp.dep.constexpr] CD3 Missing cases of value-dependency Clang 12
1414[dcl.init.ref] drafting Binding an rvalue reference to a reference-unrelated lvalue Not resolved
1415[basic.link] CD3 Missing prohibition of block-scope definition of extern object Unknown
1416[expr.typeid] CD3 Function cv-qualifiers and typeid Unknown
1417[dcl.fct] C++14 Pointers/references to functions with cv-qualifiers or ref-qualifier Unknown
1418[dcl.init.list] CD3 Type of initializer_list backing array Unknown
1419[dcl.init.list] NAD Evaluation order in aggregate initialization Unknown
1420[class.abstract] NAD Abstract final classes Unknown
1421[dcl.init.list] NAD Full expressions and aggregate initialization Unknown
1422[lex.ccon] dup Type of character literals containing universal-character-names Unknown
1423[conv.fctptr] CD3 Convertibility of nullptr to bool Clang 11
1424[except.ctor] C++14 When must sub-object destructors be accessible? Unknown
1425[class.mem] CD3 Base-class subobjects of standard-layout structs N/A (ABI constraint)
1426[dcl.fct.def.default] CD5 Allowing additional parameter types in defaulted functions Unknown
1427[class.ctor] NAD Default constructor and deleted or inaccessible destructors Unknown
1428[basic.type.qualifier] CD3 Dynamic const objects Unknown
1429[basic.scope.temp] NAD Scope of a member template's template parameter Unknown
1430[temp.alias] open Pack expansion into fixed alias template parameter list Not resolved
1431[except] CD3 Exceptions from other than throw-expressions Unknown
1432[temp.variadic] open Newly-ambiguous variadic template expansions Not resolved
1433[basic.scope.pdecl] NAD trailing-return-type and point of declaration Unknown
1434[dcl.init] NAD Parenthesized braced-init-list Unknown
1435[dcl.meaning] CD3 template-id as the declarator for a class template constructor Unknown
1436[cpp.cond] open Interaction of constant expression changes with preprocessor expressions Not resolved
1437[dcl.typedef] CD3 alignas in alias-declaration Unknown
1438[basic.stc.dynamic.safety] CD3 Non-dereference use of invalid pointers Unknown
1439[namespace.memdef] CD3 Lookup and friend template declarations Unknown
1440[expr.prim.general] CD3 Acceptable decltype-specifiers used as nested-name-specifiers Unknown
1441[intro.execution] C++14 Unclear wording for signal handler restrictions Unknown
1442[stmt.ranged] CD3 Argument-dependent lookup in the range-based for Unknown
1443[dcl.fct.default] NAD Default arguments and non-static data members Clang 2.7
1444[temp.param] drafting Type adjustments of non-type template parameters Not resolved
1445[stmt.ranged] dup Argument-dependent lookup of begin and end Unknown
1446[temp.func.order] CD4 Member function with no ref-qualifier and non-member function with rvalue reference Unknown
1447[expr.static.cast] CD3 static_cast of bit-field lvalue to rvalue reference Unknown
1448[basic.fundamental] NAD Integral values of type bool Unknown
1449[dcl.init.list] CD3 Narrowing conversion of negative value to unsigned type Unknown
1450[expr.mul] CD3 INT_MIN % -1 Unknown
1451[temp.arg.nontype] CD4 Objects with no linkage in non-type template arguments Unknown
1452[expr.const] NAD Value-initialized objects may be constants Unknown
1453[basic.types] CD3 Volatile members in literal classes? Unknown
1454[expr.const] CD3 Passing constants through constexpr functions via references Unknown
1455[expr.const] CD3 Lvalue converted constant expressions Unknown
1456[expr.const] CD3 Address constant expression designating the one-past-the-end address Unknown
1457[expr.shift] CD3 Undefined behavior in left-shift Unknown
1458[expr.unary.op] CD3 Address of incomplete type vs operator&() Clang 3.1
1459[over.ics.rank] open Reference-binding tiebreakers in overload resolution Not resolved
1460[class.union] C++14 What is an empty union? Clang 3.5
1461[dcl.init.list] NAD Narrowing conversions to bit-fields Unknown
1462[temp.deduct] CD3 Deduction failure vs “ill-formed, no diagnostic required” Unknown
1463[temp.pre] drafting extern "C" alias templates Not resolved
1464[expr.new] CD3 Negative array bound in a new-expression Unknown
1465[expr.unary.noexcept] CD4 noexcept and std::bad_array_new_length Unknown
1466[intro.multithread] C++14 Visible sequences of side effects are redundant Unknown
1467[dcl.init.list] CD4 List-initialization of aggregate from same-type object Clang 3.7 (C++11 onwards)
1468[expr.prim.lambda.capture] CD5 typeid, overload resolution, and implicit lambda capture Unknown
1469[expr.new] CD5 Omitted bound in array new-expression Unknown
1470[intro.multithread] NAD Thread migration Unknown
1471[temp.dep.type] CD3 Nested type of non-dependent base Unknown
1472[basic.def.odr] CD3 odr-use of reference variables Unknown
1473[over.literal] CD3 Syntax of literal-operator-id Unknown
1474[lex.ext] NAD User-defined literals and <inttypes.h> format macros Unknown
1475[dcl.attr.depend] CD3 Errors in [[carries_dependency]] example Unknown
1476[intro.defs] CD3 Definition of user-defined type Unknown
1477[namespace.memdef] CD3 Definition of a friend outside its namespace Clang 2.7
1478[temp.names] CD6 template keyword for dependent template template arguments Unknown
1479[over.literal] CD3 Literal operators and default arguments Clang 3.1
1480[expr.const] CD3 Constant initialization via non-constant temporary Unknown
1481[over.inc] CD3 Increment/decrement operators with reference parameters Unknown
1482[basic.scope.pdecl] CD3 Point of declaration of enumeration Clang 3.0
1483[temp.res] NAD Non-dependent static_assert-declarations Unknown
1484[temp.inst] CD4 Unused local classes of function templates Unknown
1485[dcl.enum] drafting Out-of-class definition of member unscoped opaque enumeration Not resolved
1486[temp.deduct.funcaddr] drafting Base-derived conversion in member pointer deduction Not resolved
1487[class.inhctor] CD3 When are inheriting constructors declared? Clang 3.3
1488[dcl.name] drafting abstract-pack-declarators in type-ids Not resolved
1489[basic.start.static] CD3 Is value-initialization of an array constant initialization? Unknown
1490[dcl.init.list] CD4 List-initialization from a string literal Clang 3.7 (C++11 onwards)
1491[class.copy.ctor] CD3 Move construction and rvalue reference members Unknown
1492[class.dtor] CD4 Exception specifications on template destructors Unknown
1493[class.copy.ctor] C++14 Criteria for move-construction Unknown
1494[dcl.init.list] CD3 Temporary initialization for reference binding in list-initialization Unknown
1495[temp.spec.partial] CD3 Partial specialization of variadic class template Clang 4
1496[class.name] CD4 Triviality with deleted and missing default constructors No
1497[dcl.init.aggr] NAD Aggregate initialization with parenthesized string literal Unknown
1498[stmt.ranged] dup Lifetime of temporaries in range-based for Unknown
1499[class.copy.assign] CD7 Missing case for deleted move assignment operator Unknown
1500[temp.dep.candidate] CD6 Name lookup of dependent conversion function Unknown
1501[dcl.init.list] NAD Nested braces in list-initialization Unknown
1502[dcl.init] CD3 Value initialization of unions with member initializers Unknown
1503[except.throw] CD3 Exceptions during copy to exception object Unknown
1504[expr.add] CD3 Pointer arithmetic after derived-base conversion Unknown
1505[dcl.init.list] dup Direct binding of reference to temporary in list-initialization Unknown
1506[dcl.init.list] CD3 Value category of initializer_list object Unknown
1507[dcl.init] CD3 Value initialization with trivial inaccessible default constructor Unknown
1508[dcl.init.list] C++14 Template initializer-list constructors Unknown
1509[intro.defs] C++14 Definition of “non-template function” Unknown
1510[dcl.ref] CD3 cv-qualified references via decltype Unknown
1511[basic.def.odr] CD3 const volatile variables and the one-definition rule Unknown
1512[expr.rel] CD3 Pointer comparison vs qualification conversions Clang 4
1513[temp.deduct.call] drafting initializer_list deduction failure Not resolved
1514[class.bit] C++14 Ambiguity between enumeration definition and zero-length bit-field Clang 11
1515[basic.fundamental] CD3 Modulo 2^n arithmetic for implicitly-unsigned types Unknown
1516[expr.call] CD3 Definition of “virtual function call” Unknown
1517[class.cdtor] open Unclear/missing description of behavior during construction/destruction Not resolved
1518[dcl.init.list] CD4 Explicit default constructors and copy-list-initialization Clang 4
1519[temp.variadic] NAD Conflicting default and variadic constructors Unknown
1520[temp.alias] NAD Alias template specialization vs pack expansion Unknown
1521[expr.type.conv] dup T{expr} with reference types Unknown
1522[dcl.init.list] CD3 Access checking for initializer_list array initialization Unknown
1523[stmt.ranged] CD5 Point of declaration in range-based for Unknown
1524[temp.dep.type] drafting Incompletely-defined class template base Not resolved
1525[expr.type.conv] NAD Array bound inference in temporary array Unknown
1526[temp.dep] dup Dependent-class lookup in the current instantiation Unknown
1527[expr.assign] CD3 Assignment from braced-init-list Unknown
1528[dcl.decl] CD3 Repeated cv-qualifiers in declarators Unknown
1529[basic.pre] drafting Nomenclature for variable vs reference non-static data member Not resolved
1530[basic.life] drafting Member access in out-of-lifetime objects Not resolved
1531[intro.defs] CD3 Definition of “access” (verb) Unknown
1532[temp.explicit] CD3 Explicit instantiation and member templates Unknown
1533[temp.variadic] CD3 Function pack expansion for member initialization Unknown
1534[basic.lval] dup cv-qualification of prvalue of type “array of class” Unknown
1535[expr.const] CD3 typeid in core constant expressions Unknown
1536[over.ics.list] drafting Overload resolution with temporary from initializer list Not resolved
1537[expr.const] CD3 Optional compile-time evaluation of constant expressions Unknown
1538[expr.assign] CD3 C-style cast in braced-init-list assignment Unknown
1539[basic.fundamental] CD3 Definition of “character type” Unknown
1540[expr.const] NAD Use of address constants in constant expressions Unknown
1541[stmt.return] CD3 cv void return types Unknown
1542[expr.assign] open Compound assignment of braced-init-list Not resolved
1543[over.ics.list] CD3 Implicit conversion sequence for empty initializer list Unknown
1544[dcl.stc] CD3 Linkage of member of unnamed namespace Unknown
1545[temp.friend] NAD friend function templates defined in class templates Unknown
1546[temp.deduct] NAD Errors in function template default arguments Unknown
1547[temp.res] NAD typename keyword in alias-declarations Unknown
1548[class.copy.ctor] open Copy/move construction and conversion functions Not resolved
1549[over.binary] open Overloaded comma operator with void operand Not resolved
1550[expr.cond] CD3 Parenthesized throw-expression operand of conditional-expression Clang 3.4
1551[namespace.udecl] C++14 Wording problems in using-declaration specification Unknown
1552[dcl.fct.def.default] CD4 exception-specifications and defaulted special member functions Unknown
1553[expr.sizeof] CD3 sizeof and xvalue bit-fields Unknown
1554[temp.alias] drafting Access and alias templates Not resolved
1555[expr.call] NAD Language linkage and function type compatibility Unknown
1556[over.match.copy] CD3 Constructors and explicit conversion functions in direct initialization Unknown
1557[expr.prim.lambda.closure] CD3 Language linkage of converted lambda function pointer Unknown
1558[temp.alias] CD4 Unused arguments in alias template specializations Clang 12
1559[expr.new] CD3 String too long in initializer list of new-expression Unknown
1560[expr.cond] CD3 Gratuitous lvalue-to-rvalue conversion in conditional-expression with throw-expression operand Clang 3.5
1561[dcl.init.aggr] CD4 Aggregates with empty base classes Unknown
1562[class.base.init] C++14 Non-static data member initializers and union ctor-initializer Unknown
1563[over.over] CD3 List-initialization and overloaded function disambiguation Clang 3.1
1564[dcl.spec.auto] NAD Template argument deduction from an initializer list Unknown
1565[dcl.init.list] NAD Copy elision and lifetime of initializer_list underlying array Unknown
1566[expr.new] NAD Should new std::initializer_list<T> be ill-formed? Unknown
1567[class.inhctor] C++14 Inheriting constructors and copy/move constructors Clang 3.3
1568[class.temporary] dup Temporary lifetime extension with intervening cast Unknown
1569[temp.deduct.type] C++14 Deducing a function parameter pack before ellipsis Unknown
1570[temp.arg.nontype] C++14 Address of subobject as non-type template argument Unknown
1571[dcl.init.ref] CD4 cv-qualification for indirect reference binding via conversion function Unknown
1572[dcl.init.ref] CD4 Incorrect example for rvalue reference binding via conversion function Unknown
1573[class.inhctor] CD4 Inherited constructor characteristics Clang 3.9
1574[dcl.fct.def.default] NAD Explicitly-defaulted constexpr functions in wrapper templates Unknown
1575[basic.stc.dynamic.safety] C++14 Incorrect definition of “strict pointer safety” Unknown
1576[expr] C++14 Discarded-value volatile xvalues Unknown
1577[temp.spec.partial.general] NAD Unnecessary restrictions on partial specializations Unknown
1578[dcl.init] NAD Value-initialization of aggregates Unknown
1579[class.copy.ctor] C++14 Return by converting move constructor Clang 3.9
1580[dcl.fct.default] drafting Default arguments in explicit instantiations Not resolved
1581[basic.def.odr] CD5 When are constexpr member functions defined? Unknown
1582[temp.deduct] drafting Template default arguments and deduction failure Not resolved
1583[intro.execution] C++14 Incorrect example of unspecified behavior Unknown
1584[temp.deduct.call] drafting Deducing function types from cv-qualified types Not resolved
1585[expr.ref] NAD Value category of member access of rvalue reference member Unknown
1586[class.dtor] NAD Naming a destructor via decltype Unknown
1587[dcl.constexpr] C++14 constexpr initialization and nested anonymous unions Unknown
1588[dcl.spec.auto] CD3 Deducing cv-qualified auto Unknown
1589[over.ics.rank] CD4 Ambiguous ranking of list-initialization sequences Clang 3.7 (C++11 onwards)
1590[class.copy.ctor] CD4 Bypassing non-copy/move constructor copying Unknown
1591[temp.deduct.call] CD4 Deducing array bound and element type from initializer list Unknown
1592[temp.arg.template] C++14 When do template parameters match? Unknown
1593[class.copy.ctor] C++14 “Parameter type” of special member functions Unknown
1594[class.copy.ctor] drafting Lazy declaration of special members vs overload errors Not resolved
1595[dcl.constexpr] C++14 Constructors “involved in” subobject initialization Unknown
1596[expr.rel] CD4 Non-array objects as array[1] Unknown
1597[dcl.constexpr] CD3 Misleading constexpr example Unknown
1598[expr.eq] C++14 Criterion for equality of pointers to members Unknown
1599[dcl.init.list] CD4 Lifetime of initializer_list underlying array Unknown
1600[dcl.type.simple] CD4 Erroneous reference initialization in example Unknown
1601[conv.prom] C++14 Promotion of enumeration with fixed underlying type Clang 10
1602[temp.inst] review Linkage of specialization vs linkage of template arguments Not resolved
1603[basic.link] CD4 Errors resulting from giving unnamed namespaces internal linkage Unknown
1604[dcl.init.ref] C++14 Double temporaries in reference initialization Unknown
1605[class.dtor] CD3 Misleading parenthetical comment for explicit destructor call Unknown
1606[expr.sizeof] NAD sizeof closure class Clang 3.1
1607[expr.prim.lambda] C++14 Lambdas in template parameters Unknown
1608[over.match.oper] C++14 Operator lookup in trailing return type Unknown
1609[dcl.fct.default] open Default arguments and function parameter packs Not resolved
1610[temp.deduct.partial] drafting Cv-qualification in deduction of reference to array Not resolved
1611[class.ctor] C++14 Deleted default constructor for abstract class Duplicate of 1658
1612[expr.prim.lambda.capture] C++14 Implicit lambda capture and anonymous unions Unknown
1613[expr.prim.lambda.capture] C++14 Constant expressions and lambda capture Unknown
1614[basic.def.odr] CD4 Address of pure virtual function vs odr-use Unknown
1615[dcl.align] CD4 Alignment of types, variables, and members Unknown
1616[stmt.ambig] CD6 Disambiguation parsing and template parameters Unknown
1617[dcl.align] open alignas and non-defining declarations Not resolved
1618[dcl.enum] C++14 Gratuitously-unsigned underlying enum type Unknown
1619[temp.dep.type] open Definition of current instantiation Not resolved
1620[over.literal] open User-defined literals and extended integer types Not resolved
1621[class.base.init] C++20 Member initializers in anonymous unions Unknown
1622[dcl.init.aggr] C++17 Empty aggregate initializer for union Unknown
1623[class.ctor] drafting Deleted default union constructor and member initializers Not resolved
1624[except.ctor] NAD Destruction of union members with member initializers Unknown
1625[cpp.stringize] open Adding spaces between tokens in stringizing Not resolved
1626[expr.const] dup constexpr member functions in brace-or-equal-initializers Unknown
1627[dcl.align] NAD Agreement of dependent alignas specifiers Unknown
1628[expr.new] open Deallocation function templates Not resolved
1629[expr.prim.lambda.closure] C++14 Can a closure class be a literal type? Unknown
1630[dcl.init] CD4 Multiple default constructor templates Unknown
1631[over.ics.list] CD4 Incorrect overload resolution for single-element initializer-list Clang 3.7
1632[expr.prim.lambda.capture] CD5 Lambda capture in member initializers Unknown
1633[dcl.init] CD4 Copy-initialization in member initialization Unknown
1634[basic.stc] open Temporary storage duration Not resolved
1635[temp.param] drafting How similar are template default arguments to function default arguments? Not resolved
1636[dcl.enum] CD5 Bits required for negative enumerator values Unknown
1637[dcl.constexpr] NAD Recursion in constexpr template default constructor Unknown
1638[dcl.enum] CD4 Declaring an explicit specialization of a scoped enumeration Clang 3.1
1639[except.spec] CD4 exception-specifications and pointer/pointer-to-member expressions Unknown
1640[dcl.array] CD5 Array of abstract instance of class template Unknown
1641[class.base.init] NAD Assignment in member initializer Unknown
1642[expr.compound] CD7 Missing requirements for prvalue operands Unknown
1643[temp.param] NAD Default arguments for template parameter packs Unknown
1644[temp.over.link] NAD Equivalent exception-specifications in function template declarations Unknown
1645[class.inhctor] CD4 Identical inheriting constructors via default arguments Clang 3.9
1646[expr.call] CD5 decltype-specifiers, abstract classes, and deduction failure Unknown
1647[temp.spec.partial] drafting Type agreement of non-type template arguments in partial specializations Not resolved
1648[dcl.stc] C++14 thread_local vs block extern declarations Unknown
1649[class.base.init] C++14 Error in the syntax of mem-initializer-list Unknown
1650[dcl.init.ref] NAD Class prvalues in reference initialization Unknown
1651[class.temporary] NAD Lifetime extension of temporary via reference to subobject Unknown
1652[expr.eq] CD4 Object addresses in constexpr expressions Clang 3.6
1653[expr.pre.incr] CD4 Removing deprecated increment of bool Clang 4 (C++17 onwards)
1654[basic.types] dup Literal types and constexpr defaulted constructors Unknown
1655[lex.pptoken] open Line endings in raw string literals Not resolved
1656[lex.ccon] CD6 Encoding of numerically-escaped characters Unknown
1657[namespace.def] CD4 Attributes for namespaces and enumerators Unknown
1658[class.ctor] C++14 Deleted default constructor for abstract class via destructor Clang 5
1659[basic.start.static] open Initialization order of thread_local template static data members Not resolved
1660[class.mem] C++14 member-declaration requirements and unnamed bit-fields Unknown
1661[intro.multithread] NAD Preservation of infinite loops Unknown
1662[expr.prim.lambda.capture] C++14 Capturing function parameter packs Unknown
1663[expr.prim.lambda.capture] NAD Capturing an empty pack expansion Unknown
1664[expr.prim.lambda] C++14 Argument-dependent lookup of lambdas used in default arguments Unknown
1665[temp.explicit] drafting Declaration matching in explicit instantiations Not resolved
1666[temp.arg.nontype] C++14 Address constant expressions Unknown
1667[except.throw] NAD Function exiting via exception called by destructor during unwinding Unknown
1668[dcl.fct] drafting Parameter type determination still not clear enough Not resolved
1669[basic.start.main] C++14 auto return type for main Unknown
1670[dcl.spec.auto] review auto as conversion-type-id Not resolved
1671[temp.deduct.call] NAD Unclear rules for deduction with cv-qualification Unknown
1672[class.mem] CD4 Layout compatibility with multiple empty bases Clang 7
1673[over.best.ics] C++14 Clarifying overload resolution for the second step of copy-initialization Unknown
1674[dcl.spec.auto] C++14 Return type deduction for address of function Unknown
1675[implimits] NAD Size limit for automatic array object Unknown
1676[basic.stc.dynamic.allocation] drafting auto return type for allocation and deallocation functions Not resolved
1677[basic.start.static] C++17 Constant initialization via aggregate initialization Unknown
1678[expr.sizeof] NAD Naming the type of an array of runtime bound Unknown
1679[stmt.ranged] NAD Range-based for and array of runtime bound Unknown
1680[stmt.ranged] drafting Including <initializer_list> for range-based for Not resolved
1681[expr.prim.lambda.capture] C++14 init-captures and nested lambdas Unknown
1682[basic.stc.dynamic.allocation] open Overly-restrictive rules on function templates as allocation functions Not resolved
1683[expr.const] CD4 Incorrect example after constexpr changes Unknown
1684[dcl.constexpr] C++14 Static constexpr member functions for non-literal classes Clang 3.6
1685[expr.unary.noexcept] NAD Value category of noexcept expression Unknown
1686[basic.link] CD4 Which variables are “explicitly declared const?” Unknown
1687[over.match.oper] C++14 Conversions of operands of built-in operators Clang 7
1688[dcl.constexpr] NAD Volatile constexpr variables Unknown
1689[dcl.attr.grammar] C++14 Syntactic nonterminal for operand of alignas Unknown
1690[basic.lookup.argdep] C++14 Associated namespace for local type Clang 9
1691[basic.lookup.argdep] C++14 Argument-dependent lookup and opaque enumerations Clang 9
1692[basic.lookup.argdep] C++14 Associated namespaces of doubly-nested classes Clang 9
1693[class.mem] C++14 Superfluous semicolons in class definitions Unknown
1694[expr.const] CD4 Restriction on reference to temporary as a constant expression Unknown
1695[class.temporary] NAD Lifetime extension via init-capture Unknown
1696[class.temporary] CD4 Temporary lifetime and non-static data member initializers Clang 7
1697[class.temporary] CD4 Lifetime extension and copy elision Unknown
1698[lex.phases] CD7 Files ending in \ Unknown
1699[class.friend] open Does befriending a class befriend its friends? Not resolved
1700[temp.deduct.call] NAD Does the special rvalue-reference deduction apply to alias templates? Unknown
1701[basic.types] open Array vs sequence in object representation Not resolved
1702[class.union] drafting Rephrasing the definition of “anonymous union” Not resolved
1703[dcl.link] NAD Language linkage of names of functions with internal linkage Unknown
1704[temp.explicit] CD5 Type checking in explicit instantiation of variable templates Unknown
1705[temp.deduct.partial] CD4 Unclear specification of “more specialized” Unknown
1706[dcl.attr.grammar] drafting alignas pack expansion syntax Not resolved
1707[dcl.type.elab] C++14 template in elaborated-type-specifier without nested-name-specifier Unknown
1708[dcl.link] CD4 overly-strict requirements for names with C language linkage Unknown
1709[cpp.stringize] open Stringizing raw string literals containing newline Not resolved
1710[class.derived] C++17 Missing template keyword in class-or-decltype No
1711[temp.spec.partial] CD6 Missing specification of variable template partial specializations Unknown
1712[dcl.constexpr] CD4 constexpr variable template declarations Unknown
1713[dcl.link] dup Linkage of variable template specializations Unknown
1714[class.local] NAD odr-use of this from a local class Unknown
1715[class.inhctor] CD4 Access and inherited constructor templates Clang 3.9
1716[dcl.fct.default] C++14 When are default arguments evaluated? Unknown
1717[lex.icon] C++14 Missing specification of type of binary literal Unknown
1718[cpp.replace] open Macro invocation spanning end-of-file Not resolved
1719[class.mem] CD4 Layout compatibility and cv-qualification revisited Clang 19
1720[cpp.include] NAD Macro invocation in #include directive Unknown
1721[class.static.data] review Diagnosing ODR violations for static data members Not resolved
1722[expr.prim.lambda.closure] CD4 Should lambda to function pointer conversion function be noexcept? Clang 9
1723[lex.ext] open Multicharacter user-defined character literals Not resolved
1724[temp.deduct] CD6 Unclear rules for deduction failure Unknown
1725[dcl.spec.auto] NAD Trailing return type with nested function declarator Unknown
1726[class.conv.fct] CD6 Declarator operators and conversion function Unknown
1727[temp.expl.spec] NAD Type of a specialization of a variable template Unknown
1728[temp.explicit] CD5 Type of an explicit instantiation of a variable template Unknown
1729[temp.decls] CD6 Matching declarations and definitions of variable templates Unknown
1730[temp.decls] drafting Can a variable template have an unnamed type? Not resolved
1731[class.copy.ctor] NAD is_trivially_X and definitions of special member functions Unknown
1732[stmt.select] C++14 Defining types in conditions and range-based for statements Unknown
1733[dcl.fct.def.default] CD6 Return type and value for operator= with ref-qualifier Unknown
1734[class.copy.ctor] CD4 Nontrivial deleted copy functions No
1735[lex.ext] open Out-of-range literals in user-defined-literals Not resolved
1736[class.inhctor] CD4 Inheriting constructor templates in a local class Clang 3.9
1737[temp.dep.type] C++14 Type dependence of call to a member of the current instantiation Unknown
1738[class.inhctor] C++14 Explicit instantiation/specialization of inheriting constructor templates Superseded by P0136R1
1739[expr.static.cast] C++14 Conversion of floating point to enumeration Unknown
1740[except.spec] C++14 Disambiguation of noexcept Unknown
1741[basic.def.odr] C++14 odr-use of class object in lvalue-to-rvalue conversion Unknown
1742[namespace.udecl] CD5 using-declarations and scoped enumerators Unknown
1743[expr.prim.lambda.capture] NAD init-captures in nested lambdas Unknown
1744[basic.start.static] CD4 Unordered initialization for variable template specializations Unknown
1745[dcl.constexpr] NAD thread_local constexpr variable Unknown
1746[basic.types] C++14 Are volatile scalar types trivially copyable? Unknown
1747[basic.start.static] C++14 Constant initialization of reference to function Unknown
1748[expr.new] CD4 Placement new with a null pointer Clang 3.7
1749[basic.start.static] NAD Confusing definition for constant initializer Unknown
1750[over.match.copy] CD4 “Argument” vs “parameter” Unknown
1751[basic.life] CD4 Non-trivial operations vs non-trivial initialization Unknown
1752[class.base.init] CD4 Right-recursion in mem-initializer-list Unknown
1753[basic.lookup.qual] CD4 decltype-specifier in nested-name-specifier of destructor Clang 11
1754[temp.spec.partial] NAD Declaration of partial specialization of static data member template Unknown
1755[temp.spec.partial.member] drafting Out-of-class partial specializations of member templates Not resolved
1756[dcl.init.list] CD4 Direct-list-initialization of a non-class object Clang 3.7
1757[expr.const] CD4 Const integral subobjects Unknown
1758[over.match.list] CD4 Explicit conversion in copy/move list initialization Clang 3.7
1759[lex.string] C++14 UTF-8 code units in plain char Unknown
1760[expr.prim.lambda.capture] C++14 Access of member corresponding to init-capture Unknown
1761[dcl.array] NAD Runtime check on size of automatic array Unknown
1762[over.literal] C++14 Reserved identifier used in literal-operator-id example Clang 14
1763[temp.deduct.type] open Length mismatch in template type deduction Not resolved
1764[class.member.lookup] C++14 Hiding of function from using-declaration by signature Unknown
1765[dcl.enum] C++14 Overflow of enumeration used as enumerator value Unknown
1766[dcl.enum] CD4 Values outside the range of the values of an enumeration Unknown
1767[stmt.switch] C++14 Scoped enumeration in a switch statement Unknown
1768[dcl.array] NAD Zero-element array of runtime bound Unknown
1769[except.handle] C++14 Catching a base class of the exception object Unknown
1770[temp.deduct.type] C++14 Type matching of non-type template parameters and arguments Unknown
1771[basic.lookup.qual] CD6 Restricted lookup in nested-name-specifier Unknown
1772[expr.prim.lambda] C++14 __func__ in a lambda body Clang 14
1773[conv.lval] C++14 Out-of-lifetime lvalue-to-rvalue conversion Unknown
1774[except.ctor] CD4 Discrepancy between subobject destruction and stack unwinding Unknown
1775[lex.phases] C++14 Undefined behavior of line splice in raw string literal Unknown
1776[basic.life] CD4 Replacement of class objects containing reference members Unknown
1777[except.spec] CD4 Empty pack expansion in dynamic-exception-specification Unknown
1778[dcl.fct.def.default] C++14 exception-specification in explicitly-defaulted functions Clang 9
1779[temp.dep.expr] CD4 Type dependency of __func__ Clang 14
1780[expr.prim.lambda.closure] CD4 Explicit instantiation/specialization of generic lambda operator() Unknown
1781[over.match.conv] CD5 Converting from nullptr_t to bool in overload resolution Unknown
1782[dcl.init] CD4 Form of initialization for nullptr_t to bool conversion Unknown
1783[class.dtor] NAD Why are virtual destructors non-trivial? Unknown
1784[stmt.dcl] C++17 Concurrent execution during static local initialization Unknown
1785[temp.res] NAD Conflicting diagnostic requirements for template definitions Unknown
1786[expr.new] C++14 Effect of merging allocations on memory leakage Unknown
1787[conv.lval] C++14 Uninitialized unsigned char values Unknown
1788[expr.delete] CD4 Sized deallocation of array of non-class type Unknown
1789[over.ics.rank] open Array reference vs array decay in overload resolution Not resolved
1790[dcl.fct] open Ellipsis following function parameter pack Not resolved
1791[dcl.fct.def.general] CD4 Incorrect restrictions on cv-qualifier-seq and ref-qualifier Unknown
1792[temp.expl.spec] NAD Incorrect example of explicit specialization of member enumeration Unknown
1793[dcl.stc] CD4 thread_local in explicit specializations Unknown
1794[temp.names] C++17 template keyword and alias templates Clang 2.7
1795[namespace.def] CD4 Disambiguating original-namespace-definition and extension-namespace-definition Unknown
1796[lex.charset] CD4 Is all-bits-zero for null characters a meaningful requirement? Unknown
1797[basic.fundamental] CD4 Are all bit patterns of unsigned char distinct numbers? Unknown
1798[except.spec] NAD exception-specifications of template arguments Unknown
1799[dcl.stc] CD4 mutable and non-explicit const qualification Unknown
1800[expr.unary.op] CD4 Pointer to member of nested anonymous union Clang 2.9
1801[class.union] CD4 Kind of expression referring to member of anonymous union Clang 2.8
1802[lex.string] CD4 char16_t string literals and surrogate pairs Clang 3.1
1803[class.mem] CD5 opaque-enum-declaration as member-declaration Clang 2.9
1804[temp.friend] CD4 Partial specialization and friendship Clang 2.7
1805[expr.cond] CD4 Conversions of array operands in conditional-expressions Unknown
1806[class.copy.assign] CD4 Virtual bases and move-assignment Unknown
1807[except.ctor] CD4 Order of destruction of array elements after an exception Clang 3.0
1808[class.ctor] drafting Constructor templates vs default constructors Not resolved
1809[temp.deduct] CD4 Narrowing and template argument deduction Unknown
1810[lex.ext] CD4 Invalid ud-suffixes Unknown
1811[class.dtor] CD4 Lookup of deallocation function in a virtual destructor definition Unknown
1812[temp.names] C++17 Omission of template in a typename-specifier No
1813[class] CD4 Direct vs indirect bases in standard-layout classes Clang 7
1814[dcl.fct.default] CD4 Default arguments in lambda-expressions Clang 3.1
1815[dcl.init.aggr] CD4 Lifetime extension in aggregate initialization Clang 20
1816[conv.integral] CD4 Unclear specification of bit-field values Unknown
1817[dcl.link] open Linkage specifications and nested scopes Not resolved
1818[dcl.link] CD6 Visibility and inherited language linkage Clang 3.4
1819[temp.spec.partial.general] CD4 Acceptable scopes for definition of partial specialization Unknown
1820[dcl.typedef] CD6 Qualified typedef names Clang 3.5
1821[class.mem] CD6 Qualified redeclarations in a class member-specification Clang 2.9
1822[expr.prim.lambda] CD6 Lookup of parameter names in lambda-expressions Clang 3.1
1823[dcl.fct.spec] CD4 String literal uniqueness in inline functions Unknown
1824[dcl.fct] CD4 Completeness of return type vs point of instantiation Clang 2.7
1825[temp.deduct.partial] C++17 Partial ordering between variadic and non-variadic function templates Unknown
1826[expr.const] NAD const floating-point in constant expressions Unknown
1827[dcl.init.ref] drafting Reference binding with ambiguous conversions Not resolved
1828[basic.lookup.qual] CD6 nested-name-specifier ambiguity Unknown
1829[temp.dep.type] CD6 Dependent unnamed types Unknown
1830[dcl.pre] CD4 Repeated specifiers Unknown
1831[class.copy.ctor] NAD Explicitly vs implicitly deleted move constructors Unknown
1832[expr.static.cast] CD4 Casting to incomplete enumeration Clang 3.0
1833[class.friend] NAD friend declarations naming implicitly-declared member functions Unknown
1834[basic.start.static] CD4 Constant initialization binding a reference to an xvalue Unknown
1835[basic.lookup.classref] CD6 Dependent member lookup before < Unknown
1836[expr.prim.general] CD5 Use of class type being defined in trailing-return-type Unknown
1837[expr.prim.general] CD6 Use of this in friend and local class declarations Clang 3.3
1838[namespace.memdef] CD4 Definition via unqualified-id and using-declaration Unknown
1839[basic.link] CD6 Lookup of block-scope extern declarations Unknown
1840[temp.expl.spec] drafting Non-deleted explicit specialization of deleted function template Not resolved
1841[temp.local] CD6 < following template injected-class-name Unknown
1842[intro.multithread] open Unevaluated operands and “carries a dependency” Not resolved
1843[expr.cond] CD4 Bit-field in conditional operator with throw operand Unknown
1844[temp.deduct] open Defining “immediate context” Not resolved
1845[temp.point] review Point of instantiation of a variable template specialization Not resolved
1846[dcl.fct.def.default] CD4 Declaring explicitly-defaulted implicitly-deleted functions Unknown
1847[temp.deduct.type] CD4 Clarifying compatibility during partial ordering Unknown
1848[class.dtor] CD4 Parenthesized constructor and destructor declarators Unknown
1849[basic.def.odr] CD6 Variable templates and the ODR Unknown
1850[temp.res] CD4 Differences between definition context and point of instantiation Unknown
1851[expr.new] CD4 decltype(auto) in new-expressions Unknown
1852[dcl.type.simple] CD4 Wording issues regarding decltype(auto) Unknown
1853[basic.life] dup Defining “allocated storage” Unknown
1854[dcl.fct.def.default] drafting Disallowing use of implicitly-deleted functions Not resolved
1855[class.cdtor] dup Out-of-lifetime access to nonstatic data members Unknown
1856[temp.inst] open Indirect nested classes of class templates Not resolved
1857[expr.shift] CD5 Additional questions about bits Unknown
1858[expr.eq] CD4 Comparing pointers to union members Unknown
1859[lex.string] CD5 UTF-16 in char16_t string literals Unknown
1860[class.union] C++17 What is a “direct member?” Unknown
1861[class.bit] CD4 Values of a bit-field Unknown
1862[temp.friend] CD5 Determining “corresponding members” for friendship No
1863[except.throw] CD4 Requirements on thrown object type to support std::current_exception() Unknown
1864[dcl.init.list] NAD List-initialization of array objects Unknown
1865[expr.add] CD4 Pointer arithmetic and multi-level qualification conversions Unknown
1866[except.ctor] CD4 Initializing variant members with non-trivial destructors Unknown
1867[dcl.ambig.res] NAD Function/expression ambiguity with qualified parameter name Unknown
1868[dcl.spec.auto] open Meaning of “placeholder type” Not resolved
1869[dcl.link] NAD thread_local vs linkage-specifications Unknown
1870[basic.def] CD4 Contradictory wording about definitions vs explicit specialization/instantiation Unknown
1871[lex.ext] NAD Non-identifier characters in ud-suffix Unknown
1872[dcl.constexpr] CD4 Instantiations of constexpr templates that cannot appear in constant expressions Clang 9
1873[class.access.base] CD4 Protected member access from derived class friends Unknown
1874[temp.param] CD4 Type vs non-type template parameters with class keyword Unknown
1875[basic.scope.class] CD4 Reordering declarations in class scope Unknown
1876[temp.expl.spec] NAD Preventing explicit specialization Unknown
1877[dcl.spec.auto] CD4 Return type deduction from return with no operand Unknown
1878[dcl.spec.auto] CD4 operator auto template Clang 18
1879[basic.align] NAD Inadequate definition of alignment requirement Unknown
1880[expr.call] CD4 When are parameter objects destroyed? Unknown
1881[class] CD4 Standard-layout classes and unnamed bit-fields Clang 7
1882[global.names] CD4 Reserved names without library use Unknown
1883[class.protected] review Protected access to constructors in mem-initializers Not resolved
1884[basic.link] CD6 Unclear requirements for same-named external-linkage entities Partial
1885[expr.call] CD4 Return value of a function is underspecified Unknown
1886[basic.start.main] CD4 Language linkage for main() Unknown
1887[namespace.udecl] CD4 Problems with :: as nested-name-specifier Unknown
1888[class.ctor] CD4 Implicitly-declared default constructors and explicit Unknown
1889[cpp.pragma] open Unclear effect of #pragma on conformance Not resolved
1890[class.mem] drafting Member type depending on definition of member function Not resolved
1891[expr.prim.lambda.closure] CD4 Move constructor/assignment for closure class Clang 4
1892[dcl.spec.auto] CD4 Use of auto in function type Unknown
1893[expr.type.conv] CD5 Function-style cast with braced-init-lists and empty pack expansions Unknown
1894[dcl.typedef] CD6 typedef-names and using-declarations Clang 3.8
1895[expr.cond] CD4 Deleted conversions in conditional operator operands Unknown
1896[temp.alias] CD6 Repeated alias templates Unknown
1897[basic.def.odr] review ODR vs alternative tokens Not resolved
1898[over.dcl] CD6 Use of “equivalent” in overload resolution Clang 2.7
1899[temp.dep.constexpr] CD4 Value-dependent constant expressions Unknown
1900[dcl.meaning] CD6 Do friend declarations count as “previous declarations”? Clang 2.7
1901[lex.token] open punctuator referenced but not defined Not resolved
1902[over.best.ics] CD4 What makes a conversion “otherwise ill-formed”? Clang 3.7
1903[namespace.udecl] CD4 What declarations are introduced by a non-member using-declaration? Clang 2.7
1904[temp.param] NAD Default template arguments for members of class templates Unknown
1905[temp.dep.type] NAD Dependent types and injected-class-names Unknown
1906[basic.lookup.unqual] NAD Name lookup in member friend declaration Unknown
1907[namespace.udecl] CD6 using-declarations and default arguments Unknown
1908[basic.lookup.classref] CD6 Dual destructor lookup and template-ids Unknown
1909[class.mem] CD4 Member class template with the same name as the class Clang 3.7
1910[basic.stc.dynamic.allocation] CD5 “Shall” requirement applied to runtime behavior Unknown
1911[dcl.constexpr] CD4 constexpr constructor with non-literal base class Unknown
1912[dcl.fct.def.default] CD5 exception-specification of defaulted function Unknown
1913[expr.prim.lambda] CD5 decltype((x)) in lambda-expressions Unknown
1914[dcl.attr] extension Duplicate standard attributes Extension
1915[class.base.init] open Potentially-invoked destructors in non-throwing constructors Not resolved
1916[class.copy.ctor] CD4 “Same cv-unqualified type” Unknown
1917[dcl.enum] NAD decltype-qualified enumeration names Unknown
1918[temp.friend] CD5 friend templates with dependent scopes No
1919[over.match.oper] open Overload resolution for ! with explicit conversion operator Not resolved
1920[expr.pseudo] CD4 Qualification mismatch in pseudo-destructor-name Unknown
1921[expr.const] NAD constexpr constructors and point of initialization of const variables Unknown
1922[temp.local] CD4 Injected class template names and default arguments Unknown
1923[expr.unary.op] NAD Lvalues of type void Unknown
1924[lex.literal] review Definition of “literal” and kinds of literals Not resolved
1925[expr.comma] CD4 Bit-field prvalues Unknown
1926[basic.def.odr] CD4 Potential results of subscript operator Unknown
1927[expr.prim.lambda.capture] dup Lifetime of temporaries in init-captures Unknown
1928[class.copy.ctor] NAD Triviality of deleted special member functions Unknown
1929[expr.prim.general] CD4 template keyword following namespace nested-name-specifier Unknown
1930[dcl.stc] CD4 init-declarator-list vs member-declarator-list Unknown
1931[expr.prim.lambda.closure] CD5 Default-constructible and copy-assignable closure types Unknown
1932[expr.cond] CD4 Bit-field results of conditional operators Unknown
1933[implimits] NAD Implementation limit for initializer-list elements Unknown
1934[except.spec] NAD Relaxing exception-specification compatibility requirements Unknown
1935[expr.new] CD5 Reuse of placement arguments in deallocation Unknown
1936[temp.dep] CD6 Dependent qualified-ids Unknown
1937[expr.prim.lambda.closure] CD5 Incomplete specification of function pointer from lambda Unknown
1938[intro.compliance] CD5 Should hosted/freestanding be implementation-defined? Unknown
1939[temp.deduct.call] open Argument conversions to nondeduced parameter types revisited Not resolved
1940[class.union] CD4 static_assert in anonymous unions Clang 3.5
1941[class.inhctor] CD4 SFINAE and inherited constructor default arguments Clang 3.9
1942[expr.prim.lambda] CD4 Incorrect reference to trailing-return-type Unknown
1943[class.bit] CD5 Unspecified meaning of “bit” Unknown
1944[diff] open New C incompatibilities Not resolved
1945[temp.friend] CD5 Friend declarations naming members of class templates in non-templates No
1946[except.spec] CD4 exception-specifications vs pointer dereference Unknown
1947[lex.icon] NAD Digit separators following non-octal prefix Clang 3.5
1948[basic.stc.dynamic] NAD exception-specification of replacement global new Clang 3.5
1949[intro.execution] CD4 “sequenced after” instead of “sequenced before” Unknown
1950[over.ics.rank] NAD Restructuring description of ranks of conversion sequences Unknown
1951[basic.types] CD4 Cv-qualification and literal types Unknown
1952[expr.const] CD4 Constant expressions and library undefined behavior Unknown
1953[intro.memory] CD7 Data races and common initial sequence Unknown
1954[expr.typeid] CD7 typeid null dereference check in subexpressions Unknown
1955[cpp.cond] CD4 #elif with invalid controlling expression Unknown
1956[basic.stc.auto] CD4 Reuse of storage of automatic variables Unknown
1957[dcl.spec.auto] NAD decltype(auto) with direct-list-initialization Unknown
1958[dcl.spec.auto] CD4 decltype(auto) with parenthesized initializer Unknown
1959[class.inhctor] CD4 Inadvertently inherited copy constructor Clang 3.9
1960[namespace.udecl] NAD Visibility of entity named in class-scope using-declaration No
1961[intro.multithread] C++17 Potentially-concurrent actions within a signal handler Unknown
1962[dcl.fct.def.general] open Type of __func__ Not resolved
1963[lex.name] CD4 Implementation-defined identifier characters Unknown
1964[dcl.typedef] NAD opaque-enum-declaration in alias-declaration? Unknown
1965[expr.dynamic.cast] CD7 Explicit casts to reference types Unknown
1966[dcl.enum] CD4 Colon following enumeration elaborated-type-specifier Clang 11
1967[class.copy.elision] CD4 Temporary lifetime and move-elision Unknown
1968[expr.const] NAD Address of typeid in constant expressions No
1969[class.dtor] CD6 Missing exclusion of ~S as an ordinary function name Unknown
1970[dcl.ambig.res] NAD Ambiguity resolution for (T())*x Unknown
1971[expr.unary.op] CD4 Unclear disambiguation of destructor and operator~ Unknown
1972[lex.name] CD6 Identifier character restrictions in non-identifiers Unknown
1973[expr.prim.lambda.closure] CD7 Which parameter-declaration-clause in a lambda-expression? Unknown
1974[temp.res] NAD Redundant specification of non-type typename-specifier Unknown
1975[except.spec] CD4 Permissible declarations for exception-specifications Unknown
1976[namespace.alias] NAD Ambiguity of namespace-aliases Unknown
1977[class.dtor] open Contradictory results of failed destructor lookup Not resolved
1978[class.conv.ctor] CD4 Redundant description of explicit constructor use Unknown
1979[temp.alias] drafting Alias template specialization in template member definition Not resolved
1980[temp.alias] drafting Equivalent but not functionally-equivalent redeclarations Not resolved
1981[conv] CD4 Implicit contextual conversions and explicit Unknown
1982[temp.arg.explicit] NAD Deduction extending parameter pack Unknown
1983[class.mem] CD5 Inappropriate use of virt-specifier Unknown
1984[dcl.init.list] NAD Lossless narrowing conversions Unknown
1985[dcl.init.aggr] NAD Unknown bound array member with brace-or-equal-initializer Unknown
1986[basic.start.static] drafting odr-use and delayed initialization Not resolved
1987[class.static.data] NAD constexpr static data members across translation units Unknown
1988[temp.dep.type] CD4 Ambiguity between dependent and non-dependent bases in implicit member access Unknown
1989[over.oper] drafting Insufficient restrictions on parameters of postfix operators Not resolved
1990[dcl.pre] CD4 Ambiguity due to optional decl-specifier-seq Unknown
1991[class.inhctor] CD4 Inheriting constructors vs default arguments Clang 3.9
1992[expr.new] CD4 new (std::nothrow) int[N] can throw Unknown
1993[temp.expl.spec] open Use of template<> defining member of explicit specialization Not resolved
1994[temp.expl.spec] dup Confusing wording regarding multiple template<> prefixes Duplicate of 529
1995[except.spec] CD4 exception-specifications and non-type template parameters Unknown
1996[dcl.init.list] drafting Reference list-initialization ignores conversion functions Not resolved
1997[basic.indet] CD7 Placement new and previous initialization Unknown
1998[basic.lval] NAD Additional sources of xvalue expressions Unknown
1999[lex.phases] CD4 Representation of source characters as universal-character-names Unknown
2000[lex.pptoken] CD4 header-name outside #include directive Unknown
2001[cpp.pre] CD4 non-directive is underspecified Unknown
2002[cpp.pre] open White space within preprocessing directives Not resolved
2003[cpp.replace] drafting Zero-argument macros incorrectly specified Not resolved
2004[expr.const] CD4 Unions with mutable members in constant expressions Unknown
2005[expr.const] NAD Incorrect constexpr reference initialization requirements Unknown
2006[basic.compound] CD4 Cv-qualified void types Unknown
2007[over.match.oper] CD6 Argument-dependent lookup for operator= Clang 3.4
2008[temp.arg] CD4 Default template-arguments underspecified Unknown
2009[basic.scope.class] CD6 Unclear specification of class scope N/A
2010[except.spec] CD4 exception-specifications and conversion operators Unknown
2011[expr.prim.lambda.capture] C++17 Unclear effect of reference capture of reference Unknown
2012[basic.stc] CD4 Lifetime of references Unknown
2013[expr.add] drafting Pointer subtraction in large array Not resolved
2014[new.delete.array] NAD Unneeded deallocation signatures Unknown
2015[dcl.fct.def.delete] CD4 odr-use of deleted virtual functions Unknown
2016[class.conv.fct] CD4 Confusing wording in description of conversion function Unknown
2017[stmt.return] CD4 Flowing off end is not equivalent to no-expression return Unknown
2018[dcl.init.ref] dup Qualification conversion vs reference binding Unknown
2019[basic.stc.general] CD4 Member references omitted from description of storage duration Unknown
2020[basic.def.odr] CD5 Inadequate description of odr-use of implicitly-invoked functions Unknown
2021[temp.over.link] dup Function template redeclaration via alias template Unknown
2022[expr.const] CD4 Copy elision in constant expressions Unknown
2023[expr.cond] drafting Composite reference result type of conditional operator Not resolved
2024[temp.dep.type] CD4 Dependent types and unexpanded parameter packs Unknown
2025[temp.over.link] dup Declaration matching via alias templates Unknown
2026[basic.start] CD4 Zero-initialization and constexpr Clang 11
2027[dcl.align] CD4 Unclear requirements for multiple alignas specifiers Unknown
2028[over.match.ref] drafting Converting constructors in rvalue reference initialization Not resolved
2029[expr.call] dup Abstract class return type in decltype operand Unknown
2030[class.access.base] NAD Access of injected-class-name with template arguments Unknown
2031[diff.cpp03.expr] CD4 Missing incompatibility for && Unknown
2032[temp.param] CD4 Default template-arguments of variable templates Unknown
2033[temp.spec.partial.general] CD4 Redundant restriction on partial specialization argument Unknown
2034[except.uncaught] NAD Deprecating uncaught_exception() Unknown
2035[temp.spec.partial.match] CD3 Multi-section example is confusing Unknown
2036[dcl.decl] NAD Refactoring parameters-and-qualifiers Unknown
2037[temp.type] drafting Alias templates and template declaration matching Not resolved
2038[diff.cpp14] CD4 Document C++14 incompatibility of new braced deduction rule Unknown
2039[except.spec] CD4 Constant conversions to bool Unknown
2040[dcl.decl] CD4 trailing-return-type no longer ambiguous Unknown
2041[temp.expl.spec] CD4 Namespace for explicit class template specialization Unknown
2042[basic.stc.dynamic.deallocation] review Exceptions and deallocation functions Not resolved
2043[temp.arg.nontype] drafting Generalized template arguments and array-to-pointer decay Not resolved
2044[dcl.spec.auto] CD4 decltype(auto) and void Unknown
2045[temp.over.link] CD5 “Identical” template parameter lists Unknown
2046[intro.multithread] C++17 Incomplete thread specifications Unknown
2047[except.spec] CD4 Coordinating “throws anything” specifications Unknown
2048[expr.static.cast] open C-style casts that cast away constness vs static_cast Not resolved
2049[temp.arg.nontype] CD7 List initializer in non-type template default argument Clang 18
2050[dcl.stc] NAD Consolidate specification of linkage Unknown
2051[basic.lval] CD5 Simplifying alias rules Unknown
2052[over.oper] CD4 Template argument deduction vs overloaded operators Unknown
2053[dcl.spec.auto] C++20 auto in non-generic lambdas Unknown
2054[temp.deduct] CD7 Missing description of class SFINAE Unknown
2055[temp.arg.explicit] drafting Explicitly-specified non-deduced parameter packs Not resolved
2056[class.base.init] open Member function calls in partially-initialized class objects Not resolved
2057[temp.arg.template] drafting Template template arguments with default arguments Not resolved
2058[basic.link] CD6 More errors from internal-linkage namespaces Unknown
2059[dcl.spec.auto] CD5 Linkage and deduced return types Unknown
2060[dcl.spec.auto] NAD Deduced return type for explicit specialization Unknown
2061[namespace.def] CD4 Inline namespace after simplifications Clang 2.7
2062[temp.class] CD6 Class template redeclaration requirements Unknown
2063[basic.scope.declarative] CD4 Type/nontype hiding in class scope Unknown
2064[temp.type] CD4 Conflicting specifications for dependent decltype-specifiers Unknown
2065[temp.dep.type] CD6 Current instantiation of a partial specialization Unknown
2066[temp.dep.constexpr] CD4 Does type-dependent imply value-dependent? Unknown
2067[temp.res] open Generated variadic templates requiring empty pack Not resolved
2068[class.dtor] CD4 When can/must a defaulted virtual destructor be defined? Unknown
2069[class.dtor] CD4 Do destructors have names? Unknown
2070[class.qual] CD6 using-declaration with dependent nested-name-specifier Unknown
2071[dcl.typedef] CD4 typedef with no declarator Unknown
2072[temp.inst] C++23 Default argument instantiation for member functions of templates Unknown
2073[basic.stc.dynamic.allocation] open Allocating memory for exception objects Not resolved
2074[temp.dep.type] drafting Type-dependence of local class of function template Not resolved
2075[over.ics.list] CD4 Passing short initializer lists to array reference parameters Unknown
2076[over.best.ics] CD4 List-initialization of arguments for constructor parameters Clang 13
2077[over.ics.ref] drafting Overload resolution and invalid rvalue-reference initialization Not resolved
2078[class.member.lookup] NAD Name lookup of mem-initializer-id Unknown
2079[dcl.attr.grammar] CD4 [[ appearing in a balanced-token-seq Unknown
2080[class.union] CD5 Example with empty anonymous union member Unknown
2081[dcl.spec.auto] CD5 Deduced return type in redeclaration or specialization of function template Unknown
2082[dcl.fct.default] CD4 Referring to parameters in unevaluated operands of default arguments Clang 11
2083[basic.def.odr] CD5 Incorrect cases of odr-use Partial
2084[class.ctor] CD4 NSDMIs and deleted union default constructors Clang 3.1
2085[basic.def.odr] CD4 Invalid example of adding special member function via default argument Unknown
2086[expr.prim.lambda.capture] drafting Reference odr-use vs implicit capture Not resolved
2087[expr.shift] NAD Left shift of negative value by zero bits Unknown
2088[temp.deduct.partial] CD5 Late tiebreakers in partial ordering Unknown
2089[over.match.oper] drafting Restricting selection of builtin overloaded operators Not resolved
2090[temp.dep.temp] open Dependency via non-dependent base class Not resolved
2091[temp.deduct.type] CD4 Deducing reference non-type template arguments Clang 10
2092[temp.over] CD5 Deduction failure and overload resolution Unknown
2093[except.handle] CD4 Qualification conversion for pointer-to-member handler matching Unknown
2094[class.copy.ctor] C++17 Trivial copy/move constructor for class with volatile member Clang 5
2095[expr.prim.lambda.capture] CD4 Capturing rvalue references to functions by copy Unknown
2096[basic.types] CD4 Constraints on literal unions Duplicate of 2598
2097[expr.prim.lambda] extension Lambdas and noreturn attribute Extension
2098[except.uncaught] CD4 Is uncaught_exceptions() per-thread? Unknown
2099[dcl.array] CD4 Inferring the bound of an array static data member Unknown
2100[temp.dep.constexpr] C++17 Value-dependent address of static data member of class template Clang 12
2101[temp.dep] CD4 Incorrect description of type- and value-dependence Unknown
2102[expr.new] CD7 Constructor checking in new-expression Unknown
2103[basic.def.odr] CD5 Lvalue-to-rvalue conversion is irrelevant in odr-use of a reference Clang 2.7
2104[basic.def.odr] CD4 Internal-linkage constexpr references and ODR requirements Unknown
2105[temp.arg] open When do the arguments for a parameter pack end? Not resolved
2106[temp.arg.type] CD4 Unclear restrictions on use of function-type template arguments Unknown
2107[class.temporary] CD4 Lifetime of temporaries for default arguments in array copying Unknown
2108[over.match.ref] drafting Conversions to non-class prvalues in reference initialization Not resolved
2109[temp.dep.constexpr] CD4 Value dependence underspecified Unknown
2110[over.ics.rank] drafting Overload resolution for base class conversion and reference/non-reference Not resolved
2111[dcl.init.ref] NAD Array temporaries in reference binding Unknown
2112[expr.new] CD5 new auto{x} Unknown
2113[dcl.meaning] CD4 Incomplete specification of types for declarators Unknown
2114[diff.cpp11.dcl.decl] CD3 Missing description of incompatibility from aggregate NSDMIs Unknown
2115[stmt.jump] open Order of implicit destruction vs release of automatic storage Not resolved
2116[dcl.init.aggr] C++17 Direct or copy initialization for omitted aggregate initializers Unknown
2117[dcl.constexpr] NAD Explicit specializations and constexpr function templates Unknown
2118[temp.friend] open Stateful metaprogramming via friend injection Not resolved
2119[class.virtual] NAD Disambiguation of multi-level covariant return type Unknown
2120[class] CD4 Array as first non-static data member in standard-layout class Clang 7
2121[expr.prim.lambda.general] CD6 More flexible lambda syntax Unknown
2122[basic.lval] CD4 Glvalues of void type Unknown
2123[stmt.dcl] open Omitted constant initialization of local static variables Not resolved
2124[defns.signature.member.templ] CD4 Signature of constructor template Unknown
2125[class.copy.elision] NAD Copy elision and comma operator Unknown
2126[expr.const] C++20 Lifetime-extended temporaries in constant expressions Clang 12
2127[temp.spec.partial] drafting Partial specialization and nullptr Not resolved
2128[dcl.init.aggr] open Imprecise rule for reference member initializer Not resolved
2129[expr.const] CD4 Non-object prvalues and constant expressions Unknown
2130[expr.new] CD4 Over-aligned types in new-expressions Unknown
2131[dcl.enum] drafting Ambiguity with opaque-enum-declaration Not resolved
2132[class.copy.ctor] NAD Deprecated default generated copy constructors Unknown
2133[conv.fctptr] CD5 Converting std::nullptr_t to bool Unknown
2134[expr.prim.general] NAD Objectless references to non-static member functions Unknown
2135[class.base.init] NAD mem-initializers for virtual bases of abstract classes Unknown
2136[basic.lookup.argdep] NAD Argument-dependent lookup and initializer lists Unknown
2137[dcl.init.list] CD4 List-initialization from object of same type Clang 20
2138[temp.expl.spec] NAD Explicit member specialization vs implicit instantiation Unknown
2139[conv.fpint] NAD Floating-point requirements for integer representation Unknown
2140[conv.lval] CD4 Lvalue-to-rvalue conversion of std::nullptr_t Clang 9
2141[expr.new] CD4 Ambiguity in new-expression with elaborated-type-specifier Clang 17
2142[basic.lookup.argdep] NAD Missing definition of associated classes and namespaces Unknown
2143[temp.dep.type] C++17 Value-dependency via injected-class-name Unknown
2144[dcl.fct.def.general] CD7 Function/variable declaration ambiguity Unknown
2145[dcl.fct.def.general] CD4 Parenthesized declarator in function definition Unknown
2146[intro.execution] CD4 Scalar object vs memory location in definition of “unsequenced” Unknown
2147[temp.deduct.call] CD4 Initializer-list arguments and pack deduction Unknown
2148[basic.start.static] drafting Thread storage duration and order of initialization Not resolved
2149[dcl.init.aggr] CD7 Brace elision and array length deduction Clang 3.1
2150[dcl.init.list] CD3 Initializer list array lifetime Unknown
2151[intro.object] CD4 Exception object is not created Unknown
2152[lex.ext] NAD Can an alternative token be used as a ud-suffix? Unknown
2153[class.mem] CD4 pure-specifier in friend declaration Unknown
2154[class.mem] CD4 Ambiguity of pure-specifier Unknown
2155[namespace.memdef] C++17 Defining classes and enumerations via using-declarations Unknown
2156[dcl.enum] CD4 Definition of enumeration declared by using-declaration Unknown
2157[dcl.type.elab] CD4 Further disambiguation of enumeration elaborated-type-specifier Clang 11
2158[class.dtor] drafting Polymorphic behavior during destruction Not resolved
2159[expr.prim.lambda.capture] NAD Lambda capture and local thread_local variables Unknown
2160[temp.func.order] open Issues with partial ordering Not resolved
2161[temp.explicit] NAD Explicit instantiation declaration and “preceding initialization” Unknown
2162[expr.prim.lambda.capture] CD3 Capturing this by reference Unknown
2163[dcl.constexpr] CD4 Labels in constexpr functions Unknown
2164[basic.scope.hiding] CD5 Name hiding and using-directives Unknown
2165[basic.scope.declarative] CD6 Namespaces, declarative regions, and translation units N/A
2166[expr.const] drafting Unclear meaning of “undefined constexpr function” Not resolved
2167[expr.const] CD4 Non-member references with lifetimes within the current evaluation Unknown
2168[dcl.init.list] review Narrowing conversions and +/- infinity Not resolved
2169[over.ics.list] open Narrowing conversions and overload resolution Not resolved
2170[basic.def.odr] CD5 Unclear definition of odr-use for arrays Clang 9
2171[class.copy.ctor] CD4 Triviality of copy constructor with less-qualified parameter Clang 15
2172[except.handle] drafting Multiple exceptions with one exception object Not resolved
2173[temp.spec.partial] open Partial specialization with non-deduced contexts Not resolved
2174[temp.friend] C++17 Unclear rules for friend definitions in templates Unknown
2175[dcl.ambig.res] CD4 Ambiguity with attribute in conversion operator declaration Unknown
2176[expr.call] CD4 Destroying the returned object when a destructor throws Unknown
2177[expr.new] CD5 Placement operator delete and parameter copies Unknown
2178[temp.param] NAD Substitution of dependent template arguments in default template arguments Unknown
2179[temp.spec.partial.general] drafting Required diagnostic for partial specialization after first use Not resolved
2180[class.copy.assign] CD4 Virtual bases in destructors and defaulted assignment operators Clang 3.0
2181[implimits] C++20 Normative requirements in an informative Annex Unknown
2182[expr.add] drafting Pointer arithmetic in array-like containers Not resolved
2183[except.spec] NAD Problems in description of potential exceptions Unknown
2184[diff.expr] CD4 Missing C compatibility entry for decrement of bool Unknown
2185[basic.fundamental] CD6 Cv-qualified numeric types Unknown
2186[expr.const] C++20 Unclear point that “preceding initialization” must precede Unknown
2187[class.protected] drafting Protected members and access via qualified-id Not resolved
2188[class.mem.general] open empty-declaration grammar ambiguity Not resolved
2189[over.call.object] open Surrogate call template Not resolved
2190[cpp.cond] open Insufficient specification of __has_include Not resolved
2191[except.spec] C++17 Incorrect result for noexcept(typeid(v)) Clang 19
2192[expr.const] open Constant expressions and order-of-eval undefined behavior Not resolved
2193[numeric.limits.members] NAD numeric_limits<int>::radix and digits Unknown
2194[over.match.list] drafting Impossible case in list initialization Not resolved
2195[dcl.type.cv] open Unsolicited reading of trailing volatile members Not resolved
2196[dcl.init] C++17 Zero-initialization with virtual base classes Unknown
2197[class.copy.ctor] C++17 Overload resolution and deleted special member functions Unknown
2198[basic.link] C++17 Linkage of enumerators Unknown
2199[dcl.typedef] CD6 Typedefs and tags Clang 3.8
2200[temp.arg.explicit] NAD Conversions in template argument deduction Unknown
2201[basic.type.qualifier] C++17 Cv-qualification of array types Unknown
2202[temp.inst] drafting When does default argument instantiation occur? Not resolved
2203[class.copy.ctor] drafting Defaulted copy/move constructors and UDCs Not resolved
2204[class.base.init] NAD Naming delegated constructors Unknown
2205[dcl.attr.grammar] C++17 Restrictions on use of alignas Unknown
2206[expr] C++17 Composite type of object and function pointers Unknown
2207[basic.stc.dynamic.allocation] CD5 Alignment of allocation function return value Unknown
2208[class.mem] NAD static_assert-declaration does not declare a member Unknown
2209[except.ctor] NAD Destruction of constructed array elements Unknown
2210[except.ctor] NAD Principal/target constructor confusion Unknown
2211[expr.prim.lambda.capture] C++17 Hiding by lambda captures and parameters Clang 8
2212[dcl.typedef] CD5 Typedef changing linkage after use Unknown
2213[dcl.type.elab] CD6 Forward declaration of partial specializations Clang 2.7
2214[basic.fundamental] C++17 Missing requirement on representation of integer values Unknown
2215[expr.call] C++17 Redundant description of language linkage in function call Unknown
2216[except.spec] NAD Exception specifications in unevaluated contexts Unknown
2217[dcl.constexpr] NAD constexpr constructors for non-literal types Unknown
2218[basic.lookup] C++17 Ambiguity and namespace aliases Unknown
2219[except.handle] drafting Dynamically-unreachable handlers Not resolved
2220[stmt.ranged] C++17 Hiding index variable in range-based for Unknown
2221[dcl.fct.def.default] CD6 Copying volatile objects Unknown
2222[temp.inst] drafting Additional contexts where instantiation is not required Not resolved
2223[dcl.align] drafting Multiple alignas specifiers Not resolved
2224[expr.static.cast] C++17 Member subobjects and base-class casts Unknown
2225[expr.reinterpret.cast] NAD reinterpret_cast to same floating-point type Unknown
2226[expr.cond] CD5 Xvalues vs lvalues in conditional expressions Unknown
2227[class.base.init] CD5 Destructor access and default member initializers Unknown
2228[dcl.ambig.res] review Ambiguity resolution for cast to function type Not resolved
2229[class.bit] CD5 Volatile unnamed bit-fields Clang 7
2230[basic.link] NAD Linkage of extern "C" function in unnamed namespace Unknown
2231[expr.ref] NAD Class member access to static data member template Unknown
2232[dcl.stc] open thread_local anonymous unions Not resolved
2233[dcl.fct.default] CD5 Function parameter packs following default arguments Clang 11
2234[class] CD5 Missing rules for simple-template-id as class-name Unknown
2235[temp.deduct.partial] CD5 Partial ordering and non-dependent types Unknown
2236[temp.alias] drafting When is an alias template specialization dependent? Not resolved
2237[class.ctor] CD5 Can a template-id name a constructor? Unknown
2238[basic.stc.dynamic.allocation] NAD Contradictory alignment requirements for allocation Unknown
2239[expr.delete] NAD Sized deallocation with a trivial destructor Unknown
2240[basic.def.odr] NAD this is not odr-used in a constant expression Unknown
2241[expr.call] CD5 Overload resolution is not invoked with a single function Unknown
2242[basic.def.odr] C++23 ODR violation with constant initialization possibly omitted Unknown
2243[expr.static.cast] drafting Incorrect use of implicit conversion sequence Not resolved
2244[class.protected] open Base class access in aggregate initialization Not resolved
2245[temp.point] drafting Point of instantiation of incomplete class template Not resolved
2246[class.access.base] drafting Access of indirect virtual base class constructors Not resolved
2247[expr.prim.lambda.capture] C++17 Lambda capture and variable argument list Unknown
2248[expr.delete] C++17 Problems with sized delete Unknown
2249[expr.prim.id.unqual] CD5 identifiers and id-expressions Unknown
2250[temp.point] open Implicit instantiation, destruction, and TUs Not resolved
2251[dcl.init.list] C++17 Unreachable enumeration list-initialization Unknown
2252[dcl.init.list] CD7 Enumeration list-initialization from the same type Unknown
2253[class.bit] CD5 Unnamed bit-fields and zero-initialization Unknown
2254[class.mem] CD5 Standard-layout classes and bit-fields Unknown
2255[temp.spec] CD5 Instantiated static data member templates Unknown
2256[basic.life] CD5 Lifetime of trivially-destructible objects Unknown
2257[class.temporary] CD5 Lifetime extension of references vs exceptions Unknown
2258[basic.life] open Storage deallocation during period of destruction Not resolved
2259[dcl.ambig.res] C++17 Unclear context describing ambiguity Unknown
2260[temp.expl.spec] CD5 Explicit specializations of deleted member functions Unknown
2261[temp.friend] extension Explicit instantiation of in-class friend definition Extension
2262[dcl.asm] C++17 Attributes for asm-definition Unknown
2263[temp.inst] drafting Default argument instantiation for friends Not resolved
2264[class.copy.ctor] drafting Memberwise copying with indeterminate value Not resolved
2265[temp.inst] drafting Delayed pack expansion and member redeclarations Not resolved
2266[temp.dep.type] CD5 Has dependent type vs is type-dependent Unknown
2267[dcl.init.ref] CD5 Copy-initialization of temporary in reference direct-initialization No
2268[dcl.constexpr] C++17 Unions with mutable members in constant expressions revisited Unknown
2269[dcl.init.aggr] dup Additional recursive references in aggregate DMIs Unknown
2270[temp.explicit] NAD Non-inline functions and explicit instantiation declarations Unknown
2271[class.ctor] C++17 Aliasing this Unknown
2272[dcl.init.aggr] C++17 Implicit initialization of aggregate members of reference type Unknown
2273[class.ctor] CD5 Inheriting constructors vs implicit default constructor Clang 3.3
2274[stmt.if] NAD Generic lambda capture vs constexpr if Unknown
2275[temp.dep.expr] drafting Type-dependence of function template Not resolved
2276[temp.dep.constexpr] C++17 Dependent noexcept and function type-dependence Unknown
2277[over.ics.rank] CD5 Ambiguity inheriting constructors with default arguments Partial
2278[expr.const] CD5 Copy elision in constant expressions reconsidered Unknown
2279[dcl.attr.grammar] NAD Multiple attribute-specifiers in one attribute-list Unknown
2280[expr.new] C++20 Matching a usual deallocation function with placement new Unknown
2281[expr.new] drafting Consistency of aligned operator delete replacement Not resolved
2282[expr.new] C++20 Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions Unknown
2283[expr.call] CD7 Missing complete type requirements Unknown
2284[expr.call] open Sequencing of braced-init-list arguments Not resolved
2285[dcl.struct.bind] CD5 Issues with structured bindings Clang 4
2286[expr.assign] NAD Assignment evaluation order Unknown
2287[basic.compound] CD5 Pointer-interconvertibility in non-standard-layout unions Unknown
2288[dcl.pre] NAD Contradictory optionality in simple-declaration Unknown
2289[basic.scope.declarative] CD5 Uniqueness of structured binding names Unknown
2290[over.match.funcs] CD5 Unclear specification for overload resolution and deleted special member functions Unknown
2291[over.best.ics] dup Implicit conversion sequences in non-call contexts Unknown
2292[temp.names] CD5 simple-template-id is ambiguous between class-name and type-name Clang 9
2293[class] CD5 Requirements for simple-template-id used as a class-name Unknown
2294[temp.dep.expr] CD5 Dependent auto static data members Unknown
2295[dcl.init.aggr] CD5 Aggregates with deleted defaulted constructors Unknown
2296[temp.deduct] open Are default argument instantiation failures in the “immediate context”? Not resolved
2297[intro.races] open Unclear specification of atomic operations Not resolved
2298[intro.races] open Actions and expression evaluation Not resolved
2299[dcl.constexpr] CD5 constexpr vararg functions Unknown
2300[basic.def.odr] CD5 Lambdas in multiple definitions Unknown
2301[expr.const] open Value-initialization and constexpr constructor evaluation Not resolved
2302[expr.eq] NAD Address comparison between different member subobjects Unknown
2303[temp.deduct.call] CD5 Partial ordering and recursive variadic inheritance Clang 12
2304[over.best.ics] NAD Incomplete type vs overload resolution Clang 2.8
2305[temp.explicit] CD5 Explicit instantiation of constexpr or inline variable template Unknown
2306[temp.friend] NAD Nested friend templates of class templates Unknown
2307[temp.dep.type] CD5 Unclear definition of “equivalent to a nontype template parameter” Unknown
2308[dcl.struct.bind] NAD Structured bindings and lambda capture Unknown
2309[dcl.constexpr] CD5 Restrictions on nested statements within constexpr functions Unknown
2310[conv.ptr] CD5 Type completeness and derived-to-base pointer conversions Partial
2311[over.match.list] open Missed case for guaranteed copy elision Not resolved
2312[dcl.struct.bind] CD6 Structured bindings and mutable Unknown
2313[dcl.struct.bind] CD5 Redeclaration of structured binding reference variables Unknown
2314[dcl.struct.bind] dup Structured bindings and lambda capture Unknown
2315[class.copy.ctor] CD5 What is the “corresponding special member” of a variant member? Unknown
2316[expr.cond] drafting Simplifying class conversions in conditional expressions Not resolved
2317[class.base.init] CD5 Self-referential default member initializers Unknown
2318[temp.deduct.type] CD5 Nondeduced contexts in deduction from a braced-init-list Unknown
2319[over.best.ics] drafting Nested brace initialization from same type Not resolved
2320[stmt.if] extension constexpr if and boolean conversions Extension
2321[expr.cond] CD5 Conditional operator and cv-qualified class prvalues Unknown
2322[temp.deduct] CD5 Substitution failure and lexical order Unknown
2323[basic.types] C++20 Expunge POD Unknown
2324[intro.object] drafting Size of base class subobject Not resolved
2325[intro.object] drafting std::launder and reuse of character buffers Not resolved
2326[temp.deduct.call] dup Type deduction with initializer list containing ambiguous functions Unknown
2327[dcl.init] drafting Copy elision for direct-initialization with a conversion function Not resolved
2328[temp.deduct.type] drafting Unclear presentation style of template argument deduction rules Not resolved
2329[class.copy.assign] open Virtual base classes and generated assignment operators Not resolved
2330[temp.spec] CD5 Missing references to variable templates Unknown
2331[basic.scope.class] CD6 Redundancy in description of class scope N/A
2332[dcl.type.simple] CD5 template-name as simple-type-name vs injected-class-name Unknown
2333[lex.ccon] CD6 Escape sequences in UTF-8 character literals Unknown
2334[intro.object] open Creation of objects by typeid Not resolved
2335[class.static.data] drafting Deduced return types vs member types Not resolved
2336[except.spec] CD5 Destructor characteristics vs potentially-constructed subobjects Unknown
2337[over.ics.rank] open Incorrect implication of logic ladder for conversion sequence tiebreakers Not resolved
2338[expr.static.cast] CD5 Undefined behavior converting to short enums with fixed underlying types Clang 12
2339[dcl.struct.bind] CD5 Underspecified template arguments in structured bindings Unknown
2340[dcl.struct.bind] open Reference collapsing and structured bindings Not resolved
2341[dcl.pre] CD5 Structured bindings with static storage duration Unknown
2342[expr.reinterpret.cast] CD5 Reference reinterpret_cast and pointer-interconvertibility Unknown
2343[temp.param] C++20 void* non-type template parameters Unknown
2344[stmt.select] NAD Redeclaration of names in init-statements Unknown
2345[stmt.if] CD5 Jumping across initializers in init-statements and conditions Unknown
2346[dcl.fct.default] CD5 Local variables in default arguments Clang 11
2347[expr.call] C++20 Passing short scoped enumerations to ellipsis Unknown
2348[stmt.if] NAD Non-templated constexpr if Unknown
2349[stmt] NAD Class/enumeration names vs conditions Unknown
2350[temp.deduct.partial] NAD Forwarding references and deduction guides Unknown
2351[expr.type.conv] CD5 void{} Clang 20
2352[dcl.init.ref] CD5 Similar types and reference binding Clang 10
2353[basic.def.odr] CD5 Potential results of a member access expression for a static data member Clang 9
2354[basic.align] CD5 Extended alignment and object representation Clang 15
2355[temp.deduct.type] CD6 Deducing noexcept-specifiers Unknown
2356[over.match.funcs] CD5 Base class copy and move constructors should not be inherited Clang 4
2357[basic.lookup.unqual] NAD Lookup in member function declarations Unknown
2358[expr.prim.lambda.capture] CD5 Explicit capture of value Clang 16
2359[dcl.init.aggr] CD5 Unintended copy initialization with designated initializers Unknown
2360[dcl.attr.unused] CD5 [[maybe_unused]] and structured bindings Unknown
2361[csetjmp.syn] open Unclear description of longjmp undefined behavior Not resolved
2362[dcl.fct.def.general] open __func__ should be constexpr Not resolved
2363[class.friend] NAD Opaque enumeration friend declarations Clang 19
2364[expr.const] NAD Constant expressions, aggregate initialization, and modifications Unknown
2365[expr.dynamic.cast] CD5 Confusing specification for dynamic_cast Unknown
2366[basic.start.static] CD5 Can default initialization be constant initialization? Unknown
2367[basic.def.odr] NAD Lambdas in default arguments vs the ODR Unknown
2368[expr.const] CD5 Differences in relational and three-way constant comparisons Unknown
2369[temp.deduct] CD6 Ordering between constraints and substitution Partial
2370[basic.lookup.unqual] CD6 friend declarations of namespace-scope functions No
2371[basic.def] CD5 Use of the English term “attributes” is confusing Unknown
2372[basic.link] CD5 Incorrect matching rules for block-scope extern declarations Unknown
2373[temp.func.order] CD5 Incorrect handling of static member function templates in partial ordering Unknown
2374[dcl.init.list] C++20 Overly permissive specification of enum direct-list-initialization Unknown
2375[class.static.data] NAD Multiple redeclarations of constexpr static data members Unknown
2376[over.match.class.deduct] CD5 Class template argument deduction with array declarator Clang 21
2377[over.match.viable] NAD Explicit copy constructor vs function viability Unknown
2378[expr.prim.lambda.capture] C++20 Inconsistent grammar for reference init-capture of pack Unknown
2379[temp.friend] CD5 Missing prohibition against constexpr in friend declaration Unknown
2380[basic.def.odr] CD5 capture-default makes too many references odr-usable Unknown
2381[expr.type] CD5 Composite pointer type of pointers to plain and noexcept member functions Unknown
2382[expr.new] CD5 Array allocation overhead for non-allocating placement new Unknown
2383[temp.param] NAD Variadic member functions of variadic class templates Unknown
2384[temp.deduct.conv] CD5 Conversion function templates and qualification conversions Unknown
2385[expr.prim.id.qual] CD5 Lookup for conversion-function-ids N/A
2386[dcl.struct.bind] CD5 tuple_size requirements for structured binding Clang 9
2387[basic.link] CD5 Linkage of const-qualified variable template Clang 9
2388[dcl.attr.grammar] NAD Applicability of contract-attribute-specifiers Unknown
2389[dcl.spec.auto] CD6 Agreement of deduced and explicitly-specified variable types Unknown
2390[cpp.cond] CD5 Is the argument of __has_cpp_attribute macro-expanded? Clang 14
2391[temp.variadic] dup Additional template parameters following pack expansion Unknown
2392[expr.const] C++23 new-expression size check and constant evaluation Unknown
2393[expr.pseudo] NAD Pseudo-destructors and object lifetime Unknown
2394[class.default.ctor] CD5 Const-default-constructible for members Clang 15
2395[temp.param] drafting Parameters following a pack expansion Not resolved
2396[expr.prim.id.qual] CD6 Lookup of names in complex conversion-type-ids No
2397[dcl.array] CD6 auto specifier for pointers and references to arrays Clang 17
2398[temp.arg.template] drafting Template template parameter matching and deduction Not resolved
2399[expr.assign] CD5 Unclear referent of “expression” in assignment-expression Unknown
2400[expr.const] CD5 Constexpr virtual functions and temporary objects Unknown
2401[temp.arg.nontype] C++20 Array decay vs prohibition of subobject non-type arguments Unknown
2402[lex.ccon] CD6 When is the restriction to a single c-char in a Unicode literal enforced? Unknown
2403[class.base.init] drafting Temporary materialization and base/member initialization Not resolved
2404[class.mem] CD5 [[no_unique_address]] and allocation order Unknown
2405[temp.dep.expr] CD6 Additional type-dependent expressions Unknown
2406[dcl.attr.fallthrough] CD5 [[fallthrough]] attribute and iteration statements Clang 5
2407[diff] C++23 Missing entry in Annex C for defaulted comparison operators Unknown
2408[dcl.init.aggr] NAD Temporaries and previously-initialized elements in aggregate initialization Unknown
2409[temp.expl.spec] drafting Explicit specializations of constexpr static data members Not resolved
2410[dcl.constexpr] C++23 Implicit calls of immediate functions Unknown
2411[temp.type] C++20 Comparison of pointers to members in template non-type arguments Unknown
2412[dcl.spec.auto] review SFINAE vs undeduced placeholder type Not resolved
2413[temp.res] CD6 typename in conversion-function-ids Unknown
2414[class.compare.default] C++20 Unclear results if both member and friend operator<=> are declared Unknown
2415[class.copy.assign] NAD using-declarations vs copy assignment operators Unknown
2416[temp.expl.spec] C++20 Explicit specializations vs constexpr and consteval Unknown
2417[except.spec] open Explicit instantiation and exception specifications Not resolved
2418[expr.const] CD5 Missing cases in definition of “usable in constant expressions” Unknown
2419[expr.add] C++20 Loss of generality treating pointers to objects as one-element arrays Unknown
2420[except.spec] dup Exception specifications in explicit instantiation Unknown
2421[temp.explicit] drafting Explicit instantiation of constrained member functions Not resolved
2422[temp.deduct.guide] C++20 Incorrect grammar for deduction-guide Unknown
2423[basic.pre] NAD Typedefs, names, and entities Unknown
2424[dcl.constexpr] C++20 constexpr initialization requirements for variant members Unknown
2425[over.match.class.deduct] open Confusing wording for deduction from a type Not resolved
2426[except.ctor] C++20 Reference to destructor that cannot be invoked Unknown
2427[expr.assign] C++20 Deprecation of volatile operands and unevaluated contexts Unknown
2428[temp.concept] C++23 Deprecating a concept Clang 19
2429[stmt.dcl] C++20 Initialization of thread_local variables referenced by lambdas Unknown
2430[class.mem] C++20 Completeness of return and parameter types of member functions Clang 2.7
2431[basic.exec] C++20 Full-expressions and temporaries bound to references Unknown
2432[class.spaceship] C++20 Return types for defaulted <=> Unknown
2433[basic.def.odr] C++20 Variable templates in the ODR Unknown
2434[class.temporary] review Mandatory copy elision vs non-class objects Not resolved
2435[temp.spec] open Alias template specializations Not resolved
2436[dcl.fct.def.coroutine] C++20 Copy semantics of coroutine parameters Unknown
2437[class.spaceship] C++20 Conversion of std::strong_ordering in a defaulted operator<=> Unknown
2438[conv.qual] open Problems in the specification of qualification conversions Not resolved
2439[expr.const] C++20 Undefined term in definition of “usable in constant expressions” Unknown
2440[expr.const] C++23 Allocation in core constant expressions Unknown
2441[dcl.inline] C++20 Inline function parameters Unknown
2442[over.match.viable] C++20 Incorrect requirement for default arguments Unknown
2443[module.interface] C++23 Meaningless template exports Unknown
2444[basic.start.dynamic] drafting Constant expressions in initialization odr-use Not resolved
2445[temp.func.order] C++20 Partial ordering with rewritten candidates Clang 19
2446[temp.dep.expr] C++20 Questionable type-dependency of concept-ids Unknown
2447[dcl.spec.auto] C++20 Unintended description of abbreviated function templates Unknown
2448[basic.fundamental] CD6 Cv-qualification of arithmetic types and deprecation of volatile Unknown
2449[expr.unary.op] extension Thunks as an implementation technique for pointers to virtual functions Extension
2450[temp.names] CD7 braced-init-list as a template-argument Clang 18
2451[dcl.fct.def.coroutine] C++23 promise.unhandled_exception() and final suspend point Unknown
2452[stmt.return.coroutine] CD6 Flowing off the end of a coroutine Unknown
2453[dcl.spec.auto.general] NAD Deduced return types and coroutine lambdas Unknown
2454[expr.await] NAD Tail recursion and coroutine symmetric transfer Unknown
2455[lex.phases] CD6 Concatenation of string literals vs translation phases 5 and 6 Unknown
2456[expr.const] open Viable user-defined conversions in converted constant expressions Not resolved
2457[temp.dep.type] CD6 Unexpanded parameter packs don't make a function type dependent Unknown
2458[expr.ref] CD6 Value category of expressions denoting non-static member functions Unknown
2459[temp.arg.nontype] CD7 Template parameter initialization Clang 18
2460[dcl.link] CD6 C language linkage and constrained non-template friends Unknown
2461[temp.res] CD6 Diagnosing non-bool type constraints Unknown
2462[temp.res.general] open Problems with the omission of the typename keyword Not resolved
2463[class.prop] open Trivial copyability and unions with non-trivial members Not resolved
2464[ptr.launder] CD6 Constexpr launder and unions Unknown
2465[dcl.fct.def.coroutine] CD6 Coroutine parameters passed to a promise constructor Unknown
2466[expr.await] CD6 co_await should be a single evaluation Unknown
2467[over.match.class.deduct] drafting CTAD for alias templates and the deducible check Not resolved
2468[temp.res.general] open Omission of the typename keyword in a member template parameter list Not resolved
2469[intro.object] drafting Implicit object creation vs constant expressions Not resolved
2470[intro.object] CD6 Multiple array objects providing storage for one object Unknown
2471[over.match.class.deduct] drafting Nested class template argument deduction Not resolved
2472[expr.await] NAD Value categories in await-expressions Unknown
2473[expr.prim.id.dtor] open Parentheses in pseudo-destructor calls Not resolved
2474[expr.delete] CD6 Cv-qualification and deletion Unknown
2475[basic.fundamental] C++23 Object declarations of type cv void Unknown
2476[dcl.spec.auto.general] CD7 placeholder-type-specifiers and function declarators Unknown
2477[class.copy.ctor] CD6 Defaulted vs deleted copy constructors/assignment operators Unknown
2478[temp.expl.spec] C++23 Properties of explicit specializations of implicitly-instantiated class templates Unknown
2479[basic.start.main] CD6 Missing specifications for consteval and constinit Unknown
2480[basic.lookup.general] drafting Lookup for enumerators in modules Not resolved
2481[dcl.init.ref] CD6 Cv-qualification of temporary to which a reference is bound Unknown
2482[bit.cast] CD6 bit_cast and indeterminate values Unknown
2483[dcl.link] C++23 Language linkage of static member functions Unknown
2484[conv.prom] CD6 char8_t and char16_t in integral promotions Unknown
2485[conv.prom] CD7 Bit-fields in integral promotions Unknown
2486[expr.call] CD6 Call to noexcept function via noexcept(false) pointer/lvalue Clang 4 (C++17 onwards)
2487[temp.dep.expr] drafting Type dependence of function-style cast to incomplete array type Not resolved
2488[basic.scope.scope] open Overloading virtual functions and functions with trailing requires-clauses Not resolved
2489[intro.object] C++23 Storage provided by array of char Unknown
2490[expr.const] CD6 Restrictions on destruction in constant expressions Unknown
2491[module.interface] CD6 Export of typedef after its first declaration Unknown
2492[over.ics.list] open Comparing user-defined conversion sequences in list-initialization Not resolved
2493[dcl.spec.auto.general] dup auto as a conversion-type-id Unknown
2494[basic.def.odr] CD6 Multiple definitions of non-odr-used entities Unknown
2495[stmt.return] open Glvalue result of a function call Not resolved
2496[class.virtual] CD6 ref-qualifiers and virtual overriding Clang 21
2497[temp.point] drafting Points of instantiation for constexpr function templates Not resolved
2498[temp.deduct.general] open Partial specialization failure and the immediate context Not resolved
2499[basic.compound] CD6 Inconsistency in definition of pointer-interconvertibility Unknown
2500[expr.static.cast] extension noexcept(false) functions and noexcept expressions Extension
2501[temp.explicit] drafting Explicit instantiation and trailing requires-clauses Not resolved
2502[basic.scope.block] CD6 Unintended declaration conflicts in nested statement scopes Unknown
2503[expr.prim.id] drafting Unclear relationship among name, qualified name, and unqualified name Not resolved
2504[class.inhctor.init] CD7 Inheriting constructors from virtual base classes No
2505[namespace.unnamed] drafting Nested unnamed namespace of inline unnamed namespace Not resolved
2506[dcl.struct.bind] CD6 Structured bindings and array cv-qualifiers Unknown
2507[over.oper.general] CD6 Default arguments for operator[] Unknown
2508[temp.local] C++23 Restrictions on uses of template parameter names Unknown
2509[expr.prim.lambda.general] CD6 decl-specifier-seq in lambda-specifiers Unknown
2510[class.mem.general] NAD noexcept-specifier of friend function vs class completeness Unknown
2511[class.bit] CD6 cv-qualified bit-fields Unknown
2512[expr.typeid] NAD typeid and incomplete class types Clang 2.7
2513[class.conv.fct] open Ambiguity with requires-clause and operator-function-id Not resolved
2514[basic.life] open Modifying const subobjects Not resolved
2515[expr.call] open Result of a function call Not resolved
2516[basic.scope.pdecl] C++23 Locus of enum-specifier or opaque-enum-declaration Clang 3.0
2517[expr.prim.req.nested] C++23 Useless restriction on use of parameter in constraint-expression Clang 21
2518[intro.compliance.general] C++23 Conformance requirements and #error/#warning Clang 17
2519[basic.types.general] CD7 Object representation of a bit-field Unknown
2520[defns.signature.templ] C++23 Template signature and default template arguments Unknown
2521[over.literal] C++23 User-defined literals and reserved identifiers Clang 17
2522[cpp.concat] open Removing placemarker tokens and retention of whitespace Not resolved
2523[expr.const] C++23 Undefined behavior via omitted destructor call in constant expressions Unknown
2524[over.ics.rank] NAD Distinguishing user-defined conversion sequences by ref-qualifier Unknown
2525[over.best.ics.general] open Incorrect definition of implicit conversion sequence Not resolved
2526[expr.rel] C++23 Relational comparison of void* pointers Unknown
2527[dcl.attr.nouniqueaddr] NAD Non-class potentially-overlapping objects Unknown
2528[expr.arith.conv] C++23 Three-way comparison and the usual arithmetic conversions Unknown
2529[expr.const] C++23 Constant destruction of constexpr references Unknown
2530[basic.def.odr] C++23 Multiple definitions of enumerators Unknown
2531[dcl.constexpr] CD7 Static data members redeclared as constexpr Unknown
2532[expr.new] open Kind of pointer value returned by new T[0] Not resolved
2533[basic.stc] CD7 Storage duration of implicitly created objects Unknown
2534[expr.ref] CD6 Value category of pseudo-destructor expression Unknown
2535[expr.ref] CD6 Type punning in class member access Unknown
2536[expr.const] open Partially initialized variables during constant initialization Not resolved
2537[dcl.fct] drafting Overbroad grammar for parameter-declaration Not resolved
2538[dcl.attr.grammar] C++23 Can standard attributes be syntactically ignored? Unknown
2539[class.spaceship] C++23 Three-way comparison requiring strong ordering for floating-point types Unknown
2540[lex.ccon] CD6 Unspecified interpretation of numeric-escape-sequence Unknown
2541[module.unit] open Linkage specifications, module purview, and module attachment Not resolved
2542[expr.prim.lambda.closure] CD7 Is a closure type a structural type? Unknown
2543[dcl.constinit] C++23 constinit and optimized dynamic initialization Unknown
2544[basic.compound] open Address of past-the-end of a potentially-overlapping subobject Not resolved
2545[expr.const] open Transparently replacing objects in constant expressions Not resolved
2546[class.compare.secondary] CD7 Defaulted secondary comparison operators defined as deleted Unknown
2547[dcl.fct.def.default] CD7 Defaulted comparison operator function for non-classes Clang 20
2548[expr.add] NAD Array prvalues and additive operators Unknown
2549[expr.prim.id.qual] CD7 Implicitly moving the operand of a throw-expression in unevaluated contexts Unknown
2550[dcl.ref] CD7 Type "reference to cv void" outside of a declarator Unknown
2551[basic.life] review "Refers to allocated storage" has no meaning Not resolved
2552[expr.const] CD7 Constant evaluation of non-defining variable declarations Unknown
2553[dcl.fct] review Restrictions on explicit object member functions Not resolved
2554[class.virtual] review Overriding virtual functions, also with explicit object parameters Not resolved
2555[namespace.udecl] tentatively ready Ineffective redeclaration prevention for using-declarators Not resolved
2556[stmt.return.coroutine] CD7 Unusable promise::return_void Unknown
2557[expr.ref] review Class member access referring to an unrelated class Not resolved
2558[expr.const] C++23 Uninitialized subobjects as a result of an immediate invocation Unknown
2559[expr.const] open Defaulted consteval functions Not resolved
2560[expr.prim.req.general] CD7 Parameter type determination in a requirement-parameter-list Unknown
2561[expr.prim.lambda.closure] CD7 Conversion to function pointer for lambda with explicit object parameter No
2562[dcl.fct.def.coroutine] open Exceptions thrown during coroutine startup Not resolved
2563[dcl.fct.def.coroutine] review Initialization of coroutine result object Not resolved
2564[over.call.object] drafting Conversion to function pointer with an explicit object parameter Not resolved
2565[expr.prim.req.general] open Invalid types in the parameter-declaration-clause of a requires-expression Not resolved
2566[expr.new] review Matching deallocation for uncaught exception Not resolved
2567[class.member.lookup] NAD Operator lookup ambiguity Unknown
2568[class.compare.default] CD7 Access checking during synthesis of defaulted comparison operator Unknown
2569[expr.prim.id.unqual] CD6 Use of decltype(capture) in a lambda's parameter-declaration-clause Unknown
2570[dcl.fct.def.default] CD7 Clarify constexpr for defaulted functions Unknown
2571[expr.sub] CD6 Evaluation order for subscripting Unknown
2572[over.over] review Address of overloaded function with no target Not resolved
2573[lex.phases] CD7 Undefined behavior when splicing results in a universal-character-name Unknown
2574[lex.pptoken] CD7 Undefined behavior when lexing unmatched quotes Unknown
2575[cpp.cond] open Undefined behavior when macro-replacing "defined" operator Not resolved
2576[cpp.include] open Undefined behavior with macro-expanded #include directives Not resolved
2577[cpp.replace.general] open Undefined behavior for preprocessing directives in macro arguments Not resolved
2578[cpp.stringize] CD7 Undefined behavior when creating an invalid string literal via stringizing Unknown
2579[cpp.concat] CD7 Undefined behavior when token pasting does not create a preprocessing token Unknown
2580[cpp.line] CD7 Undefined behavior with #line Unknown
2581[cpp.predefined] open Undefined behavior for predefined macros Not resolved
2582[class.member.lookup] CD6 Differing member lookup from nested classes Unknown
2583[class.mem.general] C++23 Common initial sequence should consider over-alignment Clang 19
2584[temp.over.link] open Equivalent types in function template declarations Not resolved
2585[dcl.fct.def.coroutine] CD6 Name lookup for coroutine allocation Unknown
2586[class.compare.default] CD6 Explicit object parameter for assignment and comparison Clang 20
2587[intro.races] review Visible side effects and initial value of an object Not resolved
2588[class.friend] CD7 friend declarations and module linkage Unknown
2589[temp.constr.atomic] review Context of access checks during constraint satisfaction checking Not resolved
2590[dcl.enum] C++23 Underlying type should determine size and alignment requirements of an enum Unknown
2591[class.union.general] CD7 Implicit change of active union member for anonymous union in union Unknown
2592[expr.new] open Missing definition for placement allocation/deallocation function Not resolved
2593[expr.mptr.oper] review Insufficient base class restriction for pointer-to-member expression Not resolved
2594[basic.start.main] CD6 Disallowing a global function template main Unknown
2595[special] CD7 "More constrained" for eligible special member functions Unknown
2596[temp.inst] drafting Instantiation of constrained non-template friends Not resolved
2597[module.unit] CD6 Replaceable allocation and deallocation functions in the global module Unknown
2598[basic.types.general] C++23 Unions should not require a non-static data member of literal type Clang 18
2599[expr.call] C++23 What does initializing a parameter include? Unknown
2600[temp.dep.expr] CD7 Type dependency of placeholder types Unknown
2601[except.ctor] C++23 Tracking of created and destroyed subobjects Unknown
2602[dcl.constexpr] C++23 consteval defaulted functions Unknown
2603[temp.over.link] C++23 Holistic functional equivalence for function templates Unknown
2604[temp.expl.spec] C++23 Attributes for an explicit specialization Unknown
2605[class.prop] C++23 Implicit-lifetime aggregates Unknown
2606[expr.static.cast] CD6 static_cast from "pointer to void" does not handle similar types Unknown
2607[module.interface] drafting Visibility of enumerator names Not resolved
2608[temp.arg.explicit] CD6 Omitting an empty template argument list Unknown
2609[expr.sizeof] open Padding in class types Not resolved
2610[dcl.init.aggr] C++23 Indirect private base classes in aggregates Unknown
2611[temp.variadic] C++23 Missing parentheses in expansion of fold-expression could cause syntactic reinterpretation Unknown
2612[dcl.init.general] C++23 Incorrect comment in example Unknown
2613[dcl.fct.def.coroutine] C++23 Incomplete definition of resumer Unknown
2614[expr.ref] C++23 Unspecified results for class member access Unknown
2615[cpp.cond] C++23 Missing __has_cpp_attribute(assume) Unknown
2616[stmt] C++23 Imprecise restrictions on break and continue Unknown
2617[temp.param] review Default template arguments for template members of non-template classes Not resolved
2618[temp.deduct.general] C++23 Substitution during deduction should exclude exception specifications Unknown
2619[dcl.init.aggr] C++23 Kind of initialization for a designated-initializer-list Unknown
2620[dcl.ambig.res] C++23 Nonsensical disambiguation rule Unknown
2621[enum.udecl] C++23 Kind of lookup for using enum declarations Superseded by 2877
2622[implimits] C++23 Compounding types from function and pointer-to-member types Unknown
2623[expr.new] drafting Invoking destroying operator delete for constructor failure Not resolved
2624[expr.delete] C++23 Array delete expression with no array cookie Unknown
2625[basic.life] C++23 Deletion of pointer to out-of-lifetime object Unknown
2626[expr.unary.op] C++23 Rephrase ones' complement using base-2 representation Unknown
2627[dcl.init.list] C++23 Bit-fields and narrowing conversions Clang 20
2628[over.match.class.deduct] CD7 Implicit deduction guides should propagate constraints Clang 20
2629[stmt.switch] C++23 Variables of floating-point type as switch conditions Unknown
2630[class.mem.general] C++23 Syntactic specification of class completeness Clang 9
2631[expr.const] C++23 Immediate function evaluations in default arguments Clang 16
2632[intro.defs] drafting 'user-declared' is not defined Not resolved
2633[expr.const] open typeid of constexpr-unknown dynamic type Not resolved
2634[dcl.type.elab] CD7 Avoid circularity in specification of scope for friend class declarations Unknown
2635[dcl.pre] C++23 Constrained structured bindings Clang 16
2636[ub] C++23 Update Annex E based on Unicode 15.0 UAX #31 N/A
2637[class.pre] CD7 Injected-class-name as a simple-template-id Unknown
2638[dcl.init.list] CD7 Improve the example for initializing by initializer list Unknown
2639[lex.phases] C++23 new-lines after phase 1 Unknown
2640[lex.charset] C++23 Allow more characters in an n-char sequence Clang 16
2641[lex.literal] C++23 Redundant specification of value category of literals Unknown
2642[class.member.lookup] C++23 Inconsistent use of T and C N/A
2643[basic.types.general] C++23 Completing a pointer to array of unknown bound Unknown
2644[expr.prim.lambda.capture] C++23 Incorrect comment in example Clang 8
2645[expr.call] C++23 Unused term "default argument promotions" Unknown
2646[dcl.fct.def.default] C++23 Defaulted special member functions Unknown
2647[expr.const] C++23 Fix for "needed for constant evaluation" Unknown
2648[over.call] C++23 Correspondence of surrogate call function and conversion function Unknown
2649[over.call.object] C++23 Incorrect note about implicit conversion sequence Unknown
2650[temp.deduct.general] C++23 Incorrect example for ill-formed non-type template arguments Clang 17
2651[temp.deduct.conv] C++23 Conversion function templates and "noexcept" Unknown
2652[cpp.predefined] C++23 Overbroad definition of __STDCPP_BFLOAT16_T__ Unknown
2653[dcl.fct] C++23 Can an explicit object parameter have a default argument? Clang 18
2654[expr.assign] C++23 Un-deprecation of compound volatile assignments Clang 16
2655[temp.inst] NAD Instantiation of default arguments in lambda-expressions Unknown
2656[expr.const] drafting Converting consteval lambda to function pointer in non-immediate context Not resolved
2657[dcl.init.ref] CD7 Cv-qualification adjustment when binding reference to temporary Unknown
2658[expr.const] C++23 Trivial copying of unions in core constant expressions Unknown
2659[cpp.predefined] C++23 Missing feature-test macro for lifetime extension in range-for loop Unknown
2660[expr.call] open Confusing term "this parameter" Not resolved
2661[class.mem.general] CD7 Missing disambiguation rule for pure-specifier vs. brace-or-equal-initializer Unknown
2662[class.access.general] C++23 Example for member access control vs. overload resolution Unknown
2663[namespace.udecl] CD7 Example for member redeclarations with using-declarations Unknown
2664[over.match.class.deduct] C++23 Deduction failure in CTAD for alias templates Unknown
2665[basic.life] NAD Replacing a subobject with a complete object Unknown
2666[class.temporary] open Lifetime extension through static_cast Not resolved
2667[cpp.import] C++23 Named module imports do not import macros Unknown
2668[expr.await] CD7 co_await in a lambda-expression Unknown
2669[class.base.init] open Lifetime extension for aggregate initialization Not resolved
2670[basic.link] open Programs and translation units Not resolved
2671[dcl.meaning.general] open friend named by a template-id Not resolved
2672[temp.deduct.general] CD7 Lambda body SFINAE is still required, contrary to intent and note Clang 18
2673[over.match.oper] C++23 User-declared spaceship vs. built-in operators Unknown
2674[class.ctor.general] C++23 Prohibit explicit object parameters for constructors Unknown
2675[class.union.general] open start_lifetime_as, placement-new, and active union members Not resolved
2676[basic.life] open Replacing a complete object having base subobjects Not resolved
2677[basic.life] review Replacing union subobjects Not resolved
2678[basic.def.odr] C++23 std::source_location::current is unimplementable Unknown
2679[over.best.ics.general] open Implicit conversion sequence with a null pointer constant Not resolved
2680[over.match.class.deduct] open Class template argument deduction for aggregates with designated initializers Not resolved
2681[over.match.class.deduct] C++23 Deducing member array type from string literal Clang 17
2682[temp.pre] C++23 Templated function vs. function template Unknown
2683[dcl.fct.default] CD7 Default arguments for member functions of templated nested classes Unknown
2684[basic.start.dynamic] open thread_local dynamic initialization Not resolved
2685[over.match.class.deduct] C++23 Aggregate CTAD, string, and brace elision Unknown
2686[temp.constr.constr] open Pack expansion into a non-pack parameter of a concept Not resolved
2687[over.match.call.general] C++23 Calling an explicit object member function via an address-of-overload-set Clang 18
2688[expr.call] open Calling explicit object member functions Not resolved
2689[basic.fundamental] CD7 Are cv-qualified std::nullptr_t fundamental types? Unknown
2690[class.copy.assign] C++23 Semantics of defaulted move assignment operator for unions Unknown
2691[lex.ccon] C++23 hexadecimal-escape-sequence is too greedy Unknown
2692[over.match.call.general] C++23 Static and explicit object member functions with the same parameter-type-lists Clang 19
2693[cpp.line] open Escape sequences for the string-literal of #line Not resolved
2694[cpp.pragma.op] open string-literals of the _Pragma operator Not resolved
2695[dcl.attr.grammar] C++23 Semantic ignorability of attributes Unknown
2696[expr.rel] dup Relational comparisons of pointers to void Unknown
2697[temp.deduct.guide] CD7 Deduction guides using abbreviated function syntax Unknown
2698[lex.icon] CD7 Using extended integer types with z suffix Unknown
2699[expr.throw] CD7 Inconsistency of throw-expression specification Unknown
2700[intro.compliance.general] CD7 #error disallows existing implementation practice Unknown
2701[dcl.fct.default] open Default arguments in multiple scopes / inheritance of array bounds in the same scope Not resolved
2702[expr.const] open Constant destruction of reference members Not resolved
2703[class.spaceship] CD7 Three-way comparison requiring strong ordering for floating-point types, take 2 Unknown
2704[dcl.init.ref] open Clarify meaning of "bind directly" Not resolved
2705[expr.ref] open Accessing ambiguous subobjects Not resolved
2706[basic.link] open Repeated structured binding declarations Not resolved
2707[temp.deduct.guide] CD7 Deduction guides cannot have a trailing requires-clause Clang 20
2708[dcl.init.general] CD7 Parenthesized initialization of arrays Unknown
2709[dcl.init.general] NAD Parenthesized initialization of reference-to-aggregate Unknown
2710[expr.const] CD7 Loops in constant expressions Unknown
2711[expr.throw] CD7 Source for copy-initializing the exception object Unknown
2712[over.match.oper] CD7 Simplify restrictions on built-in assignment operator candidates Unknown
2713[dcl.init.list] CD7 Initialization of reference-to-aggregate from designated initializer list Unknown
2714[over.match.class.deduct] CD7 Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor Unknown
2715[expr.call] CD7 "calling function" for parameter initialization may not exist Unknown
2716[class.conv.fct] CD7 Rule about self-or-base conversion is normatively redundant Unknown
2717[temp.variadic] CD7 Pack expansion for alignment-specifier Unknown
2718[expr.static.cast] CD7 Type completeness for derived-to-base conversions Clang 2.7
2719[basic.align] CD7 Creating objects in misaligned storage Unknown
2720[temp.res.general] CD7 Template validity rules for templated entities and alias templates Unknown
2721[basic.life] CD7 When exactly is storage reused? Unknown
2722[expr.unary.noexcept] CD7 Temporary materialization conversion for noexcept operator Unknown
2723[basic.fundamental] CD7 Range of representable values for floating-point types Unknown
2724[expr.shift] CD7 Clarify rounding for arithmetic right shift Unknown
2725[expr.ref] CD7 Overload resolution for non-call of class member access Unknown
2726[lex.digraph] review Alternative tokens appearing as attribute-tokens Not resolved
2727[module.import] open Importing header units synthesized from source files Not resolved
2728[expr.delete] CD7 Evaluation of conversions in a delete-expression Unknown
2729[expr.new] CD7 Meaning of new-type-id Unknown
2730[over.match.oper] open Comparison templates on enumeration types Not resolved
2731[over.ics.user] open List-initialization sequence with a user-defined conversion Not resolved
2732[module.import] CD7 Can importable headers react to preprocessor state from point of import? Unknown
2733[dcl.attr.unused] CD7 Applying [[maybe_unused]] to a label Unknown
2734[expr.const] open Immediate forward-declared function templates Not resolved
2735[over.match.best] open List-initialization and conversions in overload resolution Not resolved
2736[class.prop] open Standard layout class with empty base class also in first member Not resolved
2737[expr.prim.lambda.capture] review Temporary lifetime extension for reference init-captures Not resolved
2738[expr.prim.id.unqual] review "denotes a destructor" is missing specification Not resolved
2739[expr.prim.req.nested] open Nested requirement not a constant expression Not resolved
2740[expr.const] open Too many objects have constexpr-unknown type Not resolved
2741[over.ics.list] open Implicit conversion sequence from empty list to array of unknown bound Not resolved
2742[dcl.init.list] drafting Guaranteed copy elision for brace-initialization from prvalue Not resolved
2743[class.copy.ctor] open Copying non-trivial objects nested within a union Not resolved
2744[intro.object] open Multiple objects of the same type at the same address Not resolved
2745[basic.def.odr] CD7 Dependent odr-use in generic lambdas Unknown
2746[temp.res.general] CD7 Checking of default template arguments Unknown
2747[lex.phases] CD7 Cannot depend on an already-deleted splice Unknown
2748[expr.ref] CD7 Accessing static data members via null pointer Unknown
2749[expr.rel] CD7 Treatment of "pointer to void" for relational comparisons Clang 20
2750[expr.const] CD7 construct_at without constructor call Unknown
2751[stmt.dcl] NAD Order of destruction for parameters for operator functions Unknown
2752[lex.fcon] open Excess-precision floating-point literals Not resolved
2753[intro.object] CD7 Storage reuse for string literal objects and backing arrays Unknown
2754[dcl.fct.def.coroutine] CD7 Using *this in explicit object member functions that are coroutines Unknown
2755[expr.const] CD7 Incorrect wording applied by P2738R1 Unknown
2756[class.init] review Completion of initialization by delegating constructor Not resolved
2757[class.cdtor] review Deleting or deallocating storage of an object during its construction Not resolved
2758[expr.delete] CD7 What is "access and ambiguity control"? Unknown
2759[class.mem.general] CD7 [[no_unique_address]] and common initial sequence Clang 19
2760[expr.const] CD7 Defaulted constructor that is an immediate function Unknown
2761[class.dtor] CD7 Implicitly invoking the deleted destructor of an anonymous union member Unknown
2762[over.match.funcs.general] CD7 Type of implicit object parameter Unknown
2763[expr.const] CD7 Ignorability of [[noreturn]] during constant evaluation Unknown
2764[basic.scope.scope] CD7 Use of placeholders affecting name mangling Unknown
2765[intro.object] open Address comparisons between potentially non-unique objects during constant evaluation Not resolved
2766[lex.string] open Repeated evaluation of a string-literal may yield different objects Not resolved
2767[class.union.anon] open Non-defining declarations of anonymous unions Not resolved
2768[expr.assign] CD7 Assignment to enumeration variable with a braced-init-list Unknown
2769[temp.deduct.general] open Substitution into template parameters and default template arguments should be interleaved Not resolved
2770[temp.deduct.general] open Trailing requires-clause can refer to function parameters before they are substituted into Not resolved
2771[class.mfct.non.static] CD7 Transformation for unqualified-ids in address operator Clang 18
2772[diff.cpp03.dcl.dcl] CD7 Missing Annex C entry for linkage effects of linkage-specification Unknown
2773[class.union.anon] open Naming anonymous union members as class members Not resolved
2774[temp.dep.constexpr] open Value-dependence of requires-expressions Not resolved
2775[except.throw] CD7 Unclear argument type for copy of exception object Unknown
2776[intro.compliance.general] open Substitution failure and implementation limits Not resolved
2777[temp.param] CD7 Type of id-expression denoting a template parameter object Unknown
2778[expr.const] review Trivial destructor does not imply constant destruction Not resolved
2779[lex.charset] open Restrictions on the ordinary literal encoding Not resolved
2780[expr.reinterpret.cast] CD7 reinterpret_cast to reference to function types Unknown
2781[basic.def.odr] open Unclear recursion in the one-definition rule Not resolved
2782[basic.def.odr] open Treatment of closure types in the one-definition rule Not resolved
2783[module.global.frag] CD7 Handling of deduction guides in global-module-fragment Unknown
2784[support.types.layout] open Unclear definition of member-designator for offsetof Not resolved
2785[temp.dep.expr] CD7 Type-dependence of requires-expression Unknown
2786[expr.eq] open Comparing pointers to complete objects Not resolved
2787[special] open Kind of explicit object copy/move assignment function Not resolved
2788[basic.scope.scope] open Correspondence and redeclarations Not resolved
2789[over.match.best.general] CD7 Overload resolution with implicit and explicit object member functions Clang 18
2790[over.ics.list] open Aggregate initialization and user-defined conversion sequence Not resolved
2791[stmt.return] CD7 Unclear phrasing about "returning to the caller" Unknown
2792[expr.unary.noexcept] CD7 Clean up specification of noexcept operator Unknown
2793[basic.scope.block] CD7 Block-scope declaration conflicting with parameter name Unknown
2794[temp.alias] open Uniqueness of lambdas in alias templates Not resolved
2795[intro.object] CD7 Overlapping empty subobjects with different cv-qualification Unknown
2796[expr.rel] CD7 Function pointer conversions for relational operators Unknown
2797[over.match.oper] review Meaning of "corresponds" for rewritten operator candidates Not resolved
2798[expr.const] CD7 Manifestly constant evaluation of the static_assert message Clang 17
2799[class.default.ctor] drafting Inheriting default constructors Not resolved
2800[expr.const] review Instantiating constexpr variables for potential constant evaluation Not resolved
2801[dcl.init.ref] CD7 Reference binding with reference-related types Unknown
2802[dcl.fct] open Constrained auto and redeclaration with non-abbreviated syntax Not resolved
2803[over.ics.ref] CD7 Overload resolution for reference binding of similar types Unknown
2804[over.match.oper] open Lookup for determining rewrite targets Not resolved
2805[expr.delete] open Underspecified selection of deallocation function Not resolved
2806[temp.res.general] CD7 Make a type-requirement a type-only context Unknown
2807[class.dtor] CD7 Destructors declared consteval Unknown
2808[temp.inst] review Explicit specialization of defaulted special member function Not resolved
2809[dcl.fct.def.default] CD7 An implicit definition does not redeclare a function Unknown
2810[temp.res.general] CD7 Requiring the absence of diagnostics for templates Unknown
2811[basic.start.main] CD7 Clarify "use" of main Clang 3.5
2812[expr.new] open Allocation with explicit alignment Not resolved
2813[expr.ref] CD7 Class member access with prvalues Clang 20
2814[expr.static.cast] NAD Alignment requirement of incomplete class type Unknown
2815[over.ics.rank] CD7 Overload resolution for references/pointers to noexcept functions Unknown
2816[intro.progress] review Unclear phrasing "may assume ... eventually" Not resolved
2817[expr.sizeof] open sizeof(abstract class) is underspecified Not resolved
2818[lex.name] CD7 Use of predefined reserved identifiers Unknown
2819[expr.const] CD7 Cast from null pointer value in a constant expression Clang 19 (C++26 onwards)
2820[dcl.init.general] CD7 Value-initialization and default constructors Unknown
2821[basic.life] review Lifetime, zero-initialization, and dynamic initialization Not resolved
2822[basic.stc.general] CD7 Side-effect-free pointer zap Unknown
2823[expr.unary.op] CD7 Implicit undefined behavior when dereferencing pointers No
2824[dcl.init.general] CD7 Copy-initialization of arrays Unknown
2825[stmt.ranged] CD7 Range-based for statement using a braced-init-list Unknown
2826[class.temporary] drafting Missing definition of "temporary expression" Not resolved
2827[basic.fundamental] review Representation of unsigned integral types Not resolved
2828[expr.cast] CD7 Ambiguous interpretation of C-style cast Unknown
2829[over.best.ics.general] open Redundant case in restricting user-defined conversion sequences Not resolved
2830[dcl.init.list] CD7 Top-level cv-qualification should be ignored for list-initialization Unknown
2831[dcl.decl.general] CD7 Non-templated function definitions and requires-clauses Unknown
2832[class.temporary] open Invented temporary variables and temporary objects Not resolved
2833[basic.start.dynamic] review Evaluation of odr-use Not resolved
2834[temp.func.order] review Partial ordering and explicit object parameters Not resolved
2835[basic.scope.scope] open Name-independent declarations Not resolved
2836[conv.rank] CD7 Conversion rank of long double and extended floating-point types Unknown
2837[class.copy.ctor] open Instantiating and inheriting by-value copy constructors Not resolved
2838[basic.scope.block] open Declaration conflicts in lambda-expressions Not resolved
2839[class.dtor] open Explicit destruction of base classes Not resolved
2840[basic.align] open Missing requirements for fundamental alignments Not resolved
2841[class.ctor.general] open When do const objects start being const? Not resolved
2842[over.ics.rank] open Preferring an initializer_list over a single value Not resolved
2843[intro.refs] CD7 Undated reference to Unicode makes C++ a moving target Unknown
2844[over.match.oper] open Enumerating a finite set of built-in candidates Not resolved
2845[expr.prim.lambda.closure] CD7 Make the closure type of a captureless lambda a structural type Unknown
2846[dcl.fct] CD7 Out-of-class definitions of explicit object member functions Unknown
2847[temp.expl.spec] review Constrained explicit specializations of function templates at class scope Not resolved
2848[temp.explicit] CD7 Omitting an empty template argument list for explicit instantiation Unknown
2849[class.temporary] CD7 Parameter objects are not temporary objects Unknown
2850[basic.stc] CD7 Unclear storage duration for function parameter objects Unknown
2851[expr.const] CD7 Allow floating-point conversions in converted constant expressions Unknown
2852[class.mem.general] open Complete-class contexts and class-scope lambdas Not resolved
2853[expr.add] CD7 Pointer arithmetic with pointer to hypothetical element Unknown
2854[except.throw] CD7 Storage duration of exception objects Unknown
2855[expr.post.incr] CD7 Undefined behavior in postfix increment Unknown
2856[over.match.list] CD7 Copy-list-initialization with explicit default constructors Unknown
2857[basic.lookup.argdep] CD7 Argument-dependent lookup with incomplete class types No
2858[expr.prim.id.qual] CD7 Declarative nested-name-specifiers and pack-index-specifiers Clang 19
2859[dcl.init.general] CD7 Value-initialization with multiple default constructors Unknown
2860[basic.life] dup Remove and fix the term "vacuous initialization" Unknown
2861[expr.dynamic.cast] CD7 dynamic_cast on bad pointer value Unknown
2862[temp.pre] review Unclear boundaries of template declarations Not resolved
2863[basic.life] drafting Unclear synchronization requirements for object lifetime rules Not resolved
2864[dcl.init.list] CD7 Narrowing floating-point conversions Unknown
2865[expr.cond] CD7 Regression on result of conditional operator Unknown
2866[dcl.attr] open Observing the effects of [[no_unique_address]] Not resolved
2867[dcl.struct.bind] CD7 Order of initialization for structured bindings Unknown
2868[class.temporary] open Self-references in trivially copyable objects as function return values Not resolved
2869[expr.prim.this] CD7 this in local classes Unknown
2870[lex.string] CD7 Combining absent encoding-prefixes Unknown
2871[class.default.ctor] CD7 User-declared constructor templates inhibiting default constructors Unknown
2872[basic.link] CD7 Linkage and unclear "can be referred to" Unknown
2873[over.over] open Taking the address of a function involving template argument deduction Not resolved
2874[dcl.type.elab] CD7 Qualified declarations of partial specializations Unknown
2875[diff.expr] tentatively ready Missing support for round-tripping null pointer values through indirection/address operators Not resolved
2876[dcl.fct.def.general] CD7 Disambiguation of T x = delete("text") Unknown
2877[enum.udecl] CD7 Type-only lookup for using-enum-declarator Clang 19
2878[expr.cast] open C-style casts to reference types Not resolved
2879[expr.const.cast] CD7 Undesired outcomes with const_cast Unknown
2880[expr.delete] CD7 Accessibility check for destructor of incomplete class type Unknown
2881[expr.prim.lambda.closure] CD7 Type restrictions for the explicit object parameter of a lambda Clang 19
2882[expr.static.cast] CD7 Unclear treatment of conversion to void Clang 2.7
2883[basic.def.odr] CD7 Definition of "odr-usable" ignores lambda scopes No
2884[dcl.type.elab] dup Qualified declarations of partial specializations Unknown
2885[class.default.ctor] review Non-eligible trivial default constructors Not resolved
2886[class.temporary] CD7 Temporaries and trivial potentially-throwing special member functions Clang 9
2887[diff.cpp03.expr] CD7 Missing compatibility entries for xvalues Unknown
2888[basic.lookup.argdep] review Missing cases for reference and array types for argument-dependent lookup Not resolved
2889[expr.delete] open Requiring an accessible destructor for destroying operator delete Not resolved
2890[class.local] CD7 Defining members of local classes Unknown
2891[implimits] CD7 Normative status of implementation limits Unknown
2892[expr.arith.conv] CD7 Unclear usual arithmetic conversions Unknown
2893[temp.inst] NAD Instantiations in discarded if constexpr substatements Unknown
2894[expr.type.conv] CD7 Functional casts create prvalues of reference type Unknown
2895[dcl.init.general] CD7 Initialization should ignore the destination type's cv-qualification Unknown
2896[temp.deduct] review Template argument deduction involving exception specifications Not resolved
2897[class.copy.assign] open Copying potentially-overlapping union subobjects Not resolved
2898[over.best.ics.general] CD7 Clarify implicit conversion sequence from cv T to T Unknown
2899[conv.lval] CD7 Bad value representations should cause undefined behavior Unknown
2900[temp.deduct.type] open Deduction of non-type template arguments with placeholder types Not resolved
2901[basic.lval] CD7 Unclear semantics for near-match aliased access Unknown
2902[expr.prim.id.general] review Implicit this transformation outside of permitted contexts Not resolved
2903[temp.names] drafting Can we omit the template disambiguator in nested-name-specifiers in type-only contexts? Not resolved
2904[temp.pre] open Introducing template-names Not resolved
2905[temp.dep.constexpr] CD7 Value-dependence of noexcept-expression Unknown
2906[expr.cond] CD7 Lvalue-to-rvalue conversion of class types for conditional operator Unknown
2907[expr.const] CD7 Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t Unknown
2908[cpp.line] CD7 Counting physical source lines for __LINE__ Unknown
2909[expr.const] CD7 Subtle difference between constant-initialized and constexpr Unknown
2910[basic.def.odr] CD7 Effect of requirement-parameter-lists on odr-usability Unknown
2911[expr.prim.req.general] CD7 Unclear meaning of expressions "appearing within" subexpressions Unknown
2912[expr.new] open Too-large value for size in array new Not resolved
2913[temp.deduct.guide] CD7 Grammar for deduction-guide has requires-clause in the wrong position Clang 20
2914[basic.start.static] review Unclear order of initialization of static and thread-local variables Not resolved
2915[dcl.fct] CD7 Explicit object parameters of type void Clang 20
2916[temp.spec.partial] review Variable template partial specializations should not be declared static Not resolved
2917[temp.pre] review Disallow multiple friend-type-specifiers for a friend template Not resolved
2918[over.over] CD7 Consideration of constraints for address of overloaded function Clang 21
2919[over.match.ref] CD7 Conversion function candidates for initialization of const lvalue reference Unknown
2920[temp.names] open The template keyword for base classes Not resolved
2921[module.interface] CD7 Exporting redeclarations of entities not attached to a named module Unknown
2922[expr.const] CD7 constexpr placement-new is too permissive Clang 20
2923[intro.progress] tentatively ready Note about infinite loops and execution steps Not resolved
2924[defns.undefined] CD7 Undefined behavior during constant evaluation Unknown
2925[expr.delete] NAD Deleting a pointer to an incomplete enumeration type Unknown
2926[basic.lookup.qual.general] drafting Lookup context for dependent qualified names Not resolved
2927[cpp.pre] CD7 Unclear status of translation unit with module keyword Unknown
2928[basic.start.dynamic] open No ordering for initializing thread-local variables Not resolved
2929[basic.start.term] review Lifetime of trivially-destructible static or thread-local objects Not resolved
2930[class.copy.elision] CD7 Unclear term "copy/move operation" in specification of copy elision Unknown
2931[over.oper.general] CD7 Restrictions on operator functions that are explicit object member functions Unknown
2932[dcl.enum] review Value range of empty enumeration Not resolved
2933[expr.type] CD7 Dangling references Unknown
2934[dcl.fct.def.coroutine] open Unclear semantics of exception escaping from unhandled_exception Not resolved
2935[dcl.fct.def.coroutine] open Destroying the coroutine state when initial-await-resume-called is false Not resolved
2936[temp.dep.type] CD7 Local classes of templated functions should be part of the current instantiation Unknown
2937[lex.phases] CD7 Grammar for preprocessing-file has no normative effect Unknown
2938[basic.link] open Inheriting linkage from a previous declaration Not resolved
2939[expr.reinterpret.cast] CD7 Do not allow reinterpret_cast from prvalue to rvalue reference Unknown
2940[intro.object] review Definition of "object" Not resolved
2941[class.temporary] open Lifetime extension for function-style cast to reference type Not resolved
2942[dcl.fct] open Packs in a function's parameter-type-list Not resolved
2943[dcl.attr.nodiscard] CD7 Discarding a void return value Unknown
2944[expr.throw] CD7 Unsequenced throw-expressions Unknown
2945[basic.link] open Redundant constraints on matching function template declarations Not resolved
2946[temp.over.link] open Dependent call equivalence in non-ADL cases Not resolved
2947[cpp.module] open Limiting macro expansion in pp-module Not resolved
2948[temp.spec.partial.general] open Late ambiguity for partial template specialization Not resolved
2949[temp.func.order] open Treatment of ellipsis during partial ordering Not resolved
2950[class.bit] open Value preservation in enumeration vs. integer bit-fields Not resolved
2951[temp.decls.general] open Distinguishing a primary template Not resolved
2952[basic.life] open Vacuous initialization for subobjects Not resolved
2953[basic.types.general] open Value representation for non-trivially-copyable types Not resolved
2954[intro.races] NAD Simultaneous modifications of an atomic object Unknown
2955[intro.execution] open Unify rules about conflicting unordered accesses Not resolved
2956[basic.lookup.qual.general] open Missing allowance for pseudo-destructors in qualified lookup Not resolved
2957[expr.ref] open Evaluating a reference member should constitute access Not resolved
2958[over.ics.rank] open Overload resolution involving lvalue transformation and qualification conversion Not resolved
2959[expr.ref] open Naming enumerators in class member access expressions Not resolved
2960[basic.life] open Introduce discontiguous object lifetime Not resolved
2961[temp.constr] open Checking of ill-formed types in constraint-expressions Not resolved
2962[expr.const] open Evaluation of destructor call for variable with constant destruction Not resolved
2963[stmt.ambig] open Paradoxical variable-or-function declaration Not resolved
2964[conv.lval] open Reading "invalid pointer values" Not resolved
2965[basic.scope.temp] open Generic lambdas do not have a template parameter scope Not resolved
2966[basic.fundamental] open Alignment and value representation of std::nullptr_t Not resolved
2967[over.match.ref] open Explicit conversion functions Not resolved
2968[basic.lookup.general] open Name lookup result for typedef-name vs. class-name Not resolved
2969[basic.scope] open Scopes in the function-try-block of a constructor Not resolved
2970[intro.races] CD7 Races with volatile sig_atomic_t bit-fields Unknown
2971[module.global.frag] open Specializations for a class are not decl-reachable Not resolved
2972[expr.prim.id.qual] open Declarative nested-name-specifier naming a partial specialization Not resolved
2973[dcl.typedef] open Does an alias-declaration introduce a name for linkage purposes? Not resolved
2974[temp.deduct.type] open Non-deduced context for qualified-id naming a template Not resolved
2975[temp.constr.normal] open Effect of concept template-head on parameter mappings Not resolved
2976[stmt.dcl] review Transferring control out of a function Not resolved
2977[dcl.init.general] review Initialization with string literals Not resolved
2978[temp.deduct.call] open Deduction involving reference to similar types Not resolved
2979[class.mem.general] open Duplicate declarations of enumerations in class scope Not resolved
2980[temp.names] open Constraints on template template parameters Not resolved
2981[expr.arith.conv] open Usual arithmetic conversions and result types Not resolved
2982[temp.deduct.decl] CD7 Deduction in type-constraints Unknown
2983[basic.pre] review Non-type template parameters are not variables Not resolved
2984[temp.dep.constexpr] open Value-dependent structured bindings Not resolved
2985[dcl.init.ref] CD7 Unclear rules for reference initialization with conversion Unknown
2986[basic.life] open Creating objects within a mutable member of a const object Not resolved
2987[expr.static.cast] CD7 Remove dilapidated wording from static_cast Unknown
2988[basic.link] open Is a closure type from a lambda-expression appearing in a concept-definition a TU-local entity? Not resolved
2989[expr.prim.paren] open Remove misleading general allowance for parentheses Not resolved
2990[module.interface] CD7 Exporting redeclarations of namespaces Unknown
2991[dcl.init.general] open "array size" is vague Not resolved
2992[basic.pre] open Labels do not have names Not resolved
2993[dcl.fct.def.general] open Body of a destructor Not resolved
2994[temp.param] open Allowing template parameters following template parameter packs that are pack expansions Not resolved
2995[stmt.return] open Meaning of flowing off the end of a function Not resolved
2996[temp.constr.atomic] open Impenetrable definition of atomic constraint Not resolved
2997[dcl.fct.def.default] open Defaulted functions with deleted definition Not resolved
2998[temp.deduct.partial] open Missing deduction consistency check for partial ordering Not resolved
2999[class.default.ctor] open Trivial unions changing existing behavior Not resolved
3000[expr.cond] review Handling of cv-qualified class types in conditional operator Not resolved
3001[basic.life] tentatively ready Inconsistent restrictions for static_cast on pointers to out-of-lifetime objects Not resolved
3002[temp.dep.temp] tentatively ready Template parameter/argument confusion Not resolved
3003[dcl.type.simple] review Naming a deducible template for class template argument deduction Not resolved
3004[expr.const] tentatively ready Pointer arithmetic on array of unknown bound Not resolved
3005[basic.scope.scope] tentatively ready Function parameters should never be name-independent Not resolved
3006[temp.explicit] review Vague restrictions for explicit instantiations of class templates Not resolved
3007[class.compare.default] open Access checking during synthesis of defaulted comparison operator, take 2 Not resolved
3008[diff.dcl] tentatively ready Missing Annex C entry for void object declarations Not resolved
3009[expr.const] open Unclear rules for constant initialization Not resolved
3010[expr.const] open constexpr placement-new should require transparent replaceability Not resolved
3011[expr.new] tentatively ready Parenthesized aggregate initialization for new-expressions Not resolved
3012[dcl.constexpr] open Deviating constexpr or consteval across translation units Not resolved
3013[cpp.embed.gen] CD7 Disallowing macros for #embed parameters Unknown
3014[cpp.embed.gen] CD7 Comma-delimited vs. comma-separated output for #embed Unknown
3015[cpp.include] CD7 Handling of header-names for #include and #embed Unknown
3016[cpp.cond] CD7 Satisfying the syntactic requirements of #include and #embed Unknown
3017[cpp.cond] open Commas in controlling expression of conditional inclusion Not resolved
3018[cpp.cond] CD7 Validity of defined in __has_embed Unknown
3019[lex.header] open Restrictions on character sequences in header-names Not resolved
3020[cpp.cond] CD7 Missing specification for __has_cpp_attribute(indeterminate) Unknown
3021[temp.constr.order] drafting Subsumption rules for fold expanded constraints Not resolved
3022[class.dtor] review Redundant specification of explicit destructor calls Not resolved
3023[dcl.init.list] open Default arguments in list-initialization Not resolved
3024[dcl.align] open Alignment of references Not resolved
3025[basic.stc.dynamic.deallocation] open Deallocation functions returning void Not resolved
3026[expr.unary.op] open Class for pointer-to-member formation Not resolved
3027[temp.type] open Equivalence of pack-index-specifiers Not resolved
3028[namespace.udecl] open A using-declarator should bind a name Not resolved
3029[basic.align] drafting Confusing note about ordinary character types for aligned memory areas Not resolved
3030[dcl.array] open Initializing array prvalues of unknown bound Not resolved
3031[over.match.funcs.general] open Finding declarations for conversion operators for access checking Not resolved
3032[temp.arg.general] tentatively ready Template argument disambiguation Not resolved
3033[basic.scope.namespace] open Scope after declarator-id before determining correspondence Not resolved
3034[temp.inst] open Infinite recursion should hit an implementation limit Not resolved
3035[class.union.anon] open Lambda expressions in anonymous unions Not resolved
3036[basic.extended.fp] open Extended floating-point types should not be cv-qualified Not resolved
3037[namespace.udecl] open Name lookup results for using-declarators Not resolved
3038[dcl.attr.grammar] open Ignorability of attributes, again Not resolved
3039[intro.object] open Undefined behavior from implicit object creation ignores observable checkpoints Not resolved
3040[dcl.fct.def.coroutine] open Mishandling of lambda coroutines Not resolved
3041[class.dtor] open Overly aggressive rule for deleting the destructor of a union Not resolved
3042[basic.lval] open Implicit object creation is insufficient to model effective type rule of C Not resolved
3043[class.temporary] open Lifetime extension for temporaries in expansion statements Not resolved
3044[stmt.expand] tentatively ready Iterating expansion statements woes Not resolved
3045[basic.scope.block] tentatively ready Regularizing environment interactions of expansion statement Not resolved
3046[dcl.enum] open Enumerations as part of the common initial sequence Not resolved
3047[basic.life] open Calling destructors on out-of-lifetime objects Not resolved
3048[stmt.expand] tentatively ready Empty destructuring expansion statements Not resolved
3049[class.prop] open Implicitly deleted move operation should not disable trivial relocation Not resolved
3050[dcl.attr.deprecated] open [[deprecated]] for class template partial specializations Not resolved
3051[class.mem.general] open Missing specification for types of member subobjects Not resolved
3052[stmt.return] open Unclear handling of checks on discarded return statements Not resolved
3053[cpp.replace.general] tentatively ready Allowing #undef likely Not resolved
3054[expr.call] open Use of default arguments depending on shape of postfix-expression in a function call Not resolved
3055[over.call.object] open Misleading body for surrogate call function Not resolved
3056[expr.prim.req.type] open Missing semicolons in grammar for type-requirement Not resolved
3057[over.ics.ref] open Ranking of derived-to-base conversions should ignore reference binding Not resolved
3058[basic.lookup.general] open "Program point" is not defined Not resolved
3059[expr.const] open throw; in constant expressions Not resolved
3060[basic.start.main] open Change in behavior for noexcept main Not resolved
3061[stmt.expand] tentatively ready Trailing comma in an expansion-init-list Not resolved
3062[dcl.fct.default] open Overlapping specification of default template arguments Not resolved
3063[class.temporary] open Lifetime extension of temporaries past function return Not resolved
3064[basic.life] open Mishandling of placement-new in lifetime rules Not resolved
3065[basic.types.general] open Reachability and completeness of types Not resolved
3066[expr.prim.id.qual] tentatively ready Declarative nested-name-specifier in explicit instantiation Not resolved
3067[conv.array] open Array-to-pointer conversion with object type mismatch Not resolved
3068[class.access.general] open Access checking in friends involving qualified-ids Not resolved
3069[temp.constr.normal] open Reference to wrong placeholder Not resolved
3070[class.copy.assign] open Trivial assignment can skip member subobjects Not resolved
3071[dcl.struct.bind] open Negative tuple_size in structured bindings Not resolved
3072[temp.deduct.general] open Incorrect examples for lambda SFINAE Not resolved
3073[over.match.ref] open Dependence of R on T2 is unclear Not resolved
3074[cpp.module] tentatively ready Redundant ill-formedness for module macros Not resolved
3075[cpp.import] tentatively ready Unclear matching of import directive Not resolved
3076[cpp.include] tentatively ready Remove unnecessary IFNDR for malformed header-name-tokens Not resolved
3077[cpp.pre] tentatively ready Undesirable formation of import directive with string-literal Not resolved
3078[cpp.include] review Different treatment of #include pp-tokens and header-name-tokens Not resolved
3079[class.union.anon] open Allow empty-declarations in anonymous unions Not resolved
3080[temp.arg.template] tentatively ready Clarify kinds of permitted template template arguments Not resolved
3081[expr.ref] review Require glvalue when splicing direct base class relationship Not resolved
3082[expr.reinterpret.cast] tentatively ready Allow for call-compatible function types in reinterpret_cast Not resolved
3083[stmt.pre] tentatively ready Remove redundant restrictions on class and enum definitions Not resolved
3084[stmt.cont] tentatively ready compound-statements inside iteration-statements Not resolved
3085[stmt.pre] tentatively ready Apply restriction inside for-range-declaration Not resolved
3086[cpp.pragma.op] tentatively ready Destringizing should consider all sorts of encoding-prefixes Not resolved
3087[cpp.pragma.op] open Destringizing for raw string literals Not resolved
3088[cpp.replace.general] open Clarify macro treatment of identifiers with special meaning Not resolved
3089[dcl.init.general] tentatively ready const-default-constructible improperly handles std::meta::info Not resolved
3090[module.interface] tentatively ready Internal linkage from header units Not resolved
3091[basic.link] review Linking of translation units as sequences of tokens Not resolved
3092[dcl.attr.annotation] tentatively ready base-specifiers are not "declared" Not resolved
3093[expr.prim.splice] open Missing integration of direct base class relationships Not resolved
3094[lex.phases] review Rework phases for string literal concatenation and token formation Not resolved
3095[temp.dep.expr] open Type-dependent packs that are not structured binding packs Not resolved
3096[temp.dep.constexpr] open Value-dependence of size of structured binding pack with non-dependent initializer Not resolved
3097[basic.scope.scope] tentatively ready Lambda expression introduces a scope Not resolved
3098[temp.names] tentatively ready Remove redundancy "names or designates" Not resolved
3099[temp.inst] open Instantiation of type aliases from alias templates is unspecified Not resolved
3100[basic.start.term] open Destruction order for objects with static storage duration Not resolved